In [2]:
# coding: utf8
import math
import string

import nltk
import pandas
import pymorphy2
from nltk.corpus import stopwords



def normalize_review(review, morph):
    """

    :param review:
    :param morph:
    :return:
    """
    
    # Токены для исключения из корпуса
#     stop_words = stopwords.words('russian')
#     stop_words.extend(['«', '»', '–', '...', '“', '”', '—', '!',
#                     '@', '№', ':', ',', '.', '?', ':', '(', ')'])
    
    stop_words = ['«', '»', '...', '“', '”', '—', '№']

    tokens = nltk.word_tokenize(review)
    normalized_tokens = []
    # normalized_review = ''
    for token in tokens:
        token = morph.parse(token)[0].normal_form
        if token not in stop_words and token not in string.punctuation:
            normalized_tokens.append(token.lower())

    return " ".join(normalized_tokens)

def get_trained_model(morph):
    """
    Обучаем модель на всех ревью из таблицы, кроме тех, что надо будет
    классифицировать.

    :param morph:
    :return model: Ключи - классы, значения - списки с нормализованными ревью,
    относящимися к соответствующему классу
    """
    model = {
        '1': [],
        '-1': [],
        '0': []
    }
    excel_file = pandas.read_excel('Отзывы кино.xlsx', 0)
    for i, row in excel_file.iterrows():
        if (row['title'] == 'Криминальное чтиво' or
                row['title'] == 'Маленькая Мисс Счастье' or
                row['title'] == 'Амели'):
            continue

        model[str(row['label'])].append(normalize_review(row['text'], morph))

    return model


def get_reviews_to_classify(morph):
    """
    Получаем ревью из эксель таблицы, которые надо классифицировать.

    :param morph:
    :return tuple: (название фильма, текст ревью, реальный класс ревью)
    """
    model = {
        '1': [],
        '-1': [],
        '0': []
    }
    excel_file = pandas.read_excel('Отзывы кино.xlsx', 0)
    for i, row in excel_file.iterrows():
        if (row['title'] == 'Криминальное чтиво' or
                row['title'] == 'Маленькая Мисс Счастье' or
                row['title'] == 'Амели'):
            model[str(row['label'])].append(normalize_review(row['text'], morph)[1])

    return model



morph = pymorphy2.MorphAnalyzer()

# Обучаем модель
trained_model = get_trained_model(morph)

# Делаем выборку из ревью для классификаций
reviews_to_classify = get_reviews_to_classify(morph)



In [3]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

X_train = trained_model['0'] + trained_model['-1'] + trained_model['1']
Y_train = ['0']*len(trained_model['0']) + ['-1']*len(trained_model['-1']) + ['1']*len(trained_model['1'])

X_test = reviews_to_classify['0'] + reviews_to_classify['-1'] + reviews_to_classify['1']
Y_test = ['0']*len(reviews_to_classify['0']) + ['-1']*len(reviews_to_classify['-1']) + ['1']*len(reviews_to_classify['1'])

X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

from sklearn.naive_bayes import MultinomialNB, GaussianNB
mnb = MultinomialNB()
mnb = GaussianNB()


In [4]:
mnb.fit(X_train,Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [5]:
y_pred = mnb.predict(X_test)

In [6]:
from sklearn.metrics import confusion_matrix

print(Y_test)
print(y_pred)
confusion_matrix(y_true=Y_test, y_pred=y_pred)

['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '-1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1']
['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']


array([[ 0, 30,  0],
       [ 0, 30,  0],
       [ 0, 30,  0]], dtype=int64)

In [7]:
from sklearn import metrics
print(metrics.classification_report(Y_test, y_pred, labels=["0", "-1", "1"]))

              precision    recall  f1-score   support

           0       0.33      1.00      0.50        30
          -1       0.00      0.00      0.00        30
           1       0.00      0.00      0.00        30

    accuracy                           0.33        90
   macro avg       0.11      0.33      0.17        90
weighted avg       0.11      0.33      0.17        90



  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
print(metrics.precision_recall_fscore_support(Y_test, y_pred, average='weighted'))

(0.1111111111111111, 0.3333333333333333, 0.16666666666666666, None)


In [10]:
from sklearn.metrics import accuracy_score
print("NB Accuracy Score -> ",accuracy_score(y_pred, Y_test)*100)

NB Accuracy Score ->  33.33333333333333
