In [24]:
from sklearn.exceptions import UndefinedMetricWarning
import warnings
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)



import numpy as np
import pandas as pd
import json

from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

from nltk.corpus import stopwords
from many_stop_words import get_stop_words



from tqdm import tqdm_notebook

from data.utils.preprocessing import Preprocessing, preprocess
from data.utils.processing import Processing


In [25]:
RESTAURANT_CATEGORIES = [
    'AMBIENCE#GENERAL',
    'DRINKS#PRICES',
    'DRINKS#QUALITY',
    'DRINKS#STYLE_OPTIONS',
    'FOOD#PRICES',
    'FOOD#QUALITY',
    'FOOD#STYLE_OPTIONS',
    'LOCATION#GENERAL',
    'RESTAURANT#GENERAL',
    'RESTAURANT#MISCELLANEOUS',
    'RESTAURANT#PRICES',
    'SERVICE#GENERAL'
]
preprocessing = Preprocessing.PREPROCESSING_1

train_batch = {}
test_batch = {}

train_sentences = TrainSentence.objects.filter(out_of_scope=False)
test_sentences = TestSentence.objects.filter(out_of_scope=False)

def get_ys(sentences, target_category):
    ys = []
    for sentence in sentences:
        y = int(any([target_category in category for category in sentence.categories]))
        ys.append(y)
    return ys

def get_sentence_batch(sentence_type, category):
    sentences = train_sentences if sentence_type == 'train' else test_sentences
    X_raw = [x.text for x in sentences]
    X_preproc = preprocess(X_raw, preprocessing)
    y = get_ys(sentences, category)
    return X_preproc, y


for idx, category in tqdm_notebook(enumerate(RESTAURANT_CATEGORIES)):
    train_batch[category] = get_sentence_batch('train', category)
    test_batch[category]  = get_sentence_batch('test', category)





# Исследование: выбор алгоритма классификации и способа векторизации

In [38]:
colors = [
    '#109618', '#FF9900', '#3366CC', '#DC3912', '#0099C6','#DD4477', '#AAAA11','#994499','#22AA99','#6633CC','#E67300','#8B0707','#329262','#5574A6','#3B3EAC'
]

In [39]:
def test(clfs, vectorizers):
    chart_data = {
        'data': {
            'labels': [vectorizer['name'] for vectorizer in vectorizers],
            'datasets': []
        },
        'options': {
            'title': {
                'text': "Классификаторы аспектов",
                'display': True,
                'fontSize': 18
            },
            'scales': {
                'yAxes': [{
                    'scaleLabel': {
                        'display': True,
                        'labelString': 'F1 macro'
                    }
                }]
            },
            'responsive': True
        }
    }
    for i, clf in tqdm_notebook(enumerate(clfs)):
        dataset = {
            'label': clf['name'],
            'data': [],
            'backgroundColor': colors[i]
        }
        for vectroizer in vectorizers:
            result = pd.DataFrame(columns=['Category', 'Precision', 'Recall', 'F1-score', 'Accuracy'])
            for idx, category in enumerate(RESTAURANT_CATEGORIES):
                category_clf = clf['clf']
                cv = vectroizer['vect']
                X_train_raw, y_train = train_batch[category]
                X_train = cv.fit_transform(X_train_raw)
                if clf['name'] in ['GaussianNB', 'QuadraticDiscriminantAnalysis']:
                    X_train = X_train.toarray()
                category_clf.fit(X_train, y_train)

                X_test_raw, y_test = test_batch[category]
                X_test = cv.transform(X_test_raw)
                if clf['name'] in ['GaussianNB', 'QuadraticDiscriminantAnalysis']:
                    X_test = X_test.toarray()
                y_pred = category_clf.predict(X_test)

                precision, recall, f1_score, _ = metrics.precision_recall_fscore_support(y_true=y_test, y_pred=y_pred, beta=1, average='binary')
                accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
                result.loc[idx] = [category, precision, recall, f1_score, accuracy]

            F1_macro = np.mean(result['F1-score'])
            accuracy_macro = np.mean(result['Accuracy'])
            dataset['data'].append(F1_macro)
        chart_data['data']['datasets'].append(dataset)
    return chart_data

In [40]:
clfs = [
    {
        'name': 'MultinomialNB',
        'clf': MultinomialNB()
    },
    {
        'name': 'GaussianNB',
        'clf': GaussianNB()
    },
    {
        'name': 'LinearSVC',
        'clf': LinearSVC(C=2)
    },
    {
        'name': 'GradientBoostingClassifier',
        'clf': GradientBoostingClassifier()
    },
    {
        'name': 'RandomForestClassifier',
        'clf': RandomForestClassifier()
    },
    {
        'name': 'AdaBoostClassifier',
        'clf': AdaBoostClassifier()
    },
    {
        'name': 'DecisionTreeClassifier',
        'clf': DecisionTreeClassifier()
    },
#     {
#         'name': 'MLPClassifier',
#         'clf': MLPClassifier(verbose=True, early_stopping=True)
#     }
]

vectorizers = [
    {
        'name': 'Булевские вектора',
        'vect': CountVectorizer(max_features=2000, binary=True)
    },
    {
        'name': 'Частотные вектора',
        'vect': CountVectorizer(max_features=2000)
    },
    {
        'name': 'Норм. частотные вектора',
        'vect': TfidfVectorizer(max_features=2000, use_idf=False)
    },
    {
        'name': 'TF_IDF',
        'vect': TfidfVectorizer(max_features=2000, use_idf=True)
    }    
]

chart_data = test(clfs, vectorizers)




In [41]:
import json
json.dumps(chart_data, ensure_ascii=False)

'{"data": {"labels": ["Булевские вектора", "Частотные вектора", "Норм. частотные вектора", "TF_IDF"], "datasets": [{"label": "MultinomialNB", "data": [0.3288839334788854, 0.33045651972277734, 0.15974806410660744, 0.17473313232732993], "backgroundColor": "#109618"}, {"label": "GaussianNB", "data": [0.17520470595590687, 0.17491302175883935, 0.1760427391292487, 0.17288566961239682], "backgroundColor": "#FF9900"}, {"label": "LinearSVC", "data": [0.5035702549322448, 0.5047294207260998, 0.4570574077126232, 0.4662173066884136], "backgroundColor": "#3366CC"}, {"label": "GradientBoostingClassifier", "data": [0.528580058221393, 0.5053061460499303, 0.417690964941767, 0.42427353218407343], "backgroundColor": "#DC3912"}, {"label": "RandomForestClassifier", "data": [0.3250845511387415, 0.3347062158537568, 0.2803789771635122, 0.2846638777675274], "backgroundColor": "#0099C6"}, {"label": "AdaBoostClassifier", "data": [0.5287385051847129, 0.5300494717023417, 0.4973329680476826, 0.4932809132803632], "ba

# Исследование: удаление стоп слов

In [42]:
## STOP_WORDS

# TEST 1
STOP_WORDS_1 = stopwords.words('russian')

# TEST 2
STOP_WORDS_2 = stopwords.words('russian')
STOP_WORDS_2.extend(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на'])

# TEST 3
STOP_WORDS_3 = get_stop_words('ru')

In [43]:
clfs = [
    {
        'name': 'LinearSVC',
        'clf': LinearSVC(C=2)
    },
    {
        'name': 'AdaBoostClassifier',
        'clf': AdaBoostClassifier()
    },
    {
        'name': 'DecisionTreeClassifier',
        'clf': DecisionTreeClassifier()
    },
]

vectorizers = [
    {
        'name': 'Исходный текст',
        'vect': CountVectorizer(max_features=2000)
    },
    {
        'name': 'Стоп-слова nltk',
        'vect': CountVectorizer(max_features=2000, stop_words=STOP_WORDS_1)
    },
    {
        'name': 'Стоп-слова nltk + доп.',
        'vect': CountVectorizer(max_features=2000, stop_words=STOP_WORDS_2)
    },
    {
        'name': 'Стоп-слова many-stop-words',
        'vect': CountVectorizer(max_features=2000, stop_words=STOP_WORDS_3)
    },
]

colors = [
    '#3366CC', '#DD4477', '#AAAA11'
]
chart_data = test(clfs, vectorizers)




In [44]:
json.dumps(chart_data, ensure_ascii=False)

'{"data": {"labels": ["Исходный текст", "Стоп-слова nltk", "Стоп-слова nltk + доп.", "Стоп-слова many-stop-words"], "datasets": [{"label": "LinearSVC", "data": [0.5047294207260998, 0.5445120732211344, 0.5342383881144155, 0.5471825506900484], "backgroundColor": "#3366CC"}, {"label": "AdaBoostClassifier", "data": [0.5300494717023417, 0.4905900513891887, 0.4937403562345792, 0.472714212701198], "backgroundColor": "#DD4477"}, {"label": "DecisionTreeClassifier", "data": [0.5004411474030982, 0.5272488526428413, 0.5384638417837196, 0.5386589288660005], "backgroundColor": "#AAAA11"}]}, "options": {"title": {"text": "Классификаторы аспектов", "display": true, "fontSize": 18}, "scales": {"yAxes": [{"scaleLabel": {"display": true, "labelString": "F1 macro"}}]}, "responsive": true}}'

In [45]:
from pprint import pprint 

In [17]:
pprint(chart_data)

{'data': {'datasets': [{'backgroundColor': '#3366CC',
                        'data': [0.4936583867068472,
                                 0.5405516282314086,
                                 0.5390008061921042,
                                 0.5553944670725142],
                        'label': 'LinearSVC'},
                       {'backgroundColor': '#DD4477',
                        'data': [0.532457414975359,
                                 0.47105683686236594,
                                 0.47458368198815326,
                                 0.4356172763738108],
                        'label': 'AdaBoostClassifier'},
                       {'backgroundColor': '#AAAA11',
                        'data': [0.5017004147390782,
                                 0.46118093896262113,
                                 0.47384261020376517,
                                 0.49695213435482993],
                        'label': 'DecisionTreeClassifier'}],
          'labels': ['Исходный 

In [49]:
ch_data = {'data': {'datasets': [
                        {'backgroundColor': '#3366CC',
                        'data': [0.6953, 0.6693, 0.6483],
                        },
                       ],
                 },
           'labels': [
               'LSTM + комбинирование классификаторов',
               'LSTM',
               'Best Semeval 2016',
           ],
 'options': {'responsive': True,
             'scales': {'yAxes': [{'scaleLabel': {'display': True,
                                                  'labelString': 'F1 macro'}}]},
             'title': {'display': True,
                       'fontSize': 18,
                       'text': 'Классификаторы аспектов'}}}

json.dumps(ch_data, ensure_ascii=False)

'{"data": {"datasets": [{"backgroundColor": "#3366CC", "data": [0.6953, 0.6693, 0.6483]}]}, "labels": ["LSTM + комбинирование классификаторов", "LSTM", "Best Semeval 2016"], "options": {"responsive": true, "scales": {"yAxes": [{"scaleLabel": {"display": true, "labelString": "F1 macro"}}]}, "title": {"display": true, "fontSize": 18, "text": "Классификаторы аспектов"}}}'

In [7]:
result.style.bar(subset=['Precision', 'Recall', 'F1-score'], color='#5fba7d').format("{:.2}", subset=['Precision', 'Recall', 'F1-score'], )

NameError: name 'result' is not defined

In [None]:
F1_macro = np.mean(result['F1-score'])
F1_macro

In [None]:
chart_data = {
        'data': {
            'labels': [vectorizer['name'] for vectorizer in vectorizers],
            'datasets': []
        },
        'options': {
            'title': {
                'text': "Классификаторы аспектов",
                'display': True,
                'fontSize': 18
            },
            'scales': {
                'yAxes': [{
                    'scaleLabel': {
                        'display': True,
                        'labelString': 'F1 macro'
                    }
                }]
            },
            'responsive': True
        }
    }