In [30]:
# coding: utf8
import math
import string

import nltk
import pandas
import pymorphy2
import sklearn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer


def normalize_review(review, morph):
    """

    :param review:
    :param morph:
    :return:
    """

    tokens = nltk.word_tokenize(review)
    normalized_tokens = []
    # normalized_review = ''
    for token in tokens:
        token = morph.parse(token)[0].normal_form
        if token not in stop_words and token not in string.punctuation:
            normalized_tokens.append(token.lower())

    return normalized_tokens, " ".join(normalized_tokens)


def get_my_reviews(morph):
    """
    Получаем мои ревью из эксель таблицы.

    :param morph:
    :return tuple: (название фильма, текст ревью, реальный класс ревью)
    """
    pos_reviews = []
    neg_reviews = []
    neutr_reviews = []
    
    excel_file = pandas.read_excel('Отзывы кино.xlsx', 0)
    for i, row in excel_file.iterrows():
        if (row['title'] == 'Криминальное чтиво' or
                row['title'] == 'Маленькая Мисс Счастье' or
                row['title'] == 'Амели'):
            
            if row['label'] == 1:
                pos_reviews.append((row['title'], normalize_review(row['text'], morph),
                            str(row['label'])))
            if row['label'] == -1:
                neg_reviews.append((row['title'], normalize_review(row['text'], morph),
                            str(row['label'])))
            if row['label'] == 0:
                neutr_reviews.append((row['title'], normalize_review(row['text'], morph),
                            str(row['label'])))

    return pos_reviews, neg_reviews, neutr_reviews


def get_reviews(morph):
    """
    Получаем ревью одногруппников из эксель таблицы.

    :param morph:
    :return tuple: (название фильма, список токенов, реальный класс ревью)
    """
    pos_reviews = []
    neg_reviews = []
    neutr_reviews = []
    
    excel_file = pandas.read_excel('Отзывы кино.xlsx', 0)
    for i, row in excel_file.iterrows():
        if (row['title'] != 'Криминальное чтиво' and
                row['title'] != 'Маленькая Мисс Счастье' and
                row['title'] != 'Амели'):
            if row['label'] == 1:
                pos_reviews.append((row['title'], normalize_review(row['text'], morph),
                            str(row['label'])))
            if row['label'] == -1:
                neg_reviews.append((row['title'], normalize_review(row['text'], morph),
                            str(row['label'])))
            if row['label'] == 0:
                neutr_reviews.append((row['title'], normalize_review(row['text'], morph),
                            str(row['label'])))
            

    return pos_reviews, neg_reviews, neutr_reviews


morph = pymorphy2.MorphAnalyzer()

stop_words = stopwords.words('russian')
stop_words.extend(['«', '»', '–', '...', '“', '”', '—', '!',
                   '@', '№', ':', ',', '.', '?', ':', '(', ')'])
stop_words = set(stop_words)

train_pos_reviews, train_neg_reviews, train_neutr_reviews = get_reviews(morph)
test_pos_reviews, test_neg_reviews, test_neutr_reviews = get_my_reviews(morph)

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer="word")

print('Done with parsing')

Done with parsing


In [29]:
import numpy

In [31]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

In [47]:
train_pos_reviews = [item[1][1] for item in train_pos_reviews]
train_neg_reviews = [item[1][1] for item in train_neg_reviews]
train_neutr_reviews = [item[1][1] for item in train_neutr_reviews]

test_pos_reviews = [item[1][1] for item in test_pos_reviews]
test_neg_reviews = [item[1][1] for item in test_neg_reviews]
test_neutr_reviews = [item[1][1] for item in test_neutr_reviews]


In [70]:
model = LogisticRegression(max_iter=1000)

X = train_pos_reviews + train_neg_reviews + train_neutr_reviews
y = [1]*len(train_pos_reviews) + [-1]*len(train_neg_reviews) + [0]*len(train_neutr_reviews)

X_test = test_pos_reviews + test_neg_reviews + test_neutr_reviews
y_test = [1]*len(test_pos_reviews) + [-1]*len(test_neg_reviews) + [0]*len(test_neutr_reviews)


X = vectorizer.fit_transform(X).toarray()
X_test = vectorizer.transform(X_test).toarray()

model.fit(X, y)
print(model)
print('\n')
expected = y_test
predicted = model.predict(X_test)

print(metrics.classification_report(expected, predicted))
print('\n')
print(metrics.confusion_matrix(expected, predicted))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


              precision    recall  f1-score   support

          -1       0.60      0.60      0.60        30
           0       0.35      0.20      0.26        30
           1       0.51      0.73      0.60        30

    accuracy                           0.51        90
   macro avg       0.49      0.51      0.49        90
weighted avg       0.49      0.51      0.49        90



[[18  6  6]
 [ 9  6 15]
 [ 3  5 22]]


In [76]:
model.classes_

array([-1,  0,  1])

In [110]:
weights = model.coef_[1]
vectorizer = CountVectorizer(analyzer="word")
features = vectorizer.fit_transform(train_neutr_reviews).toarray()
vocab = vectorizer.get_feature_names()
mapping = zip(weights, vocab)

mapping = sorted(mapping, key=lambda tup: tup[0])

# первые 10
print(mapping[:10])
print('\n\n')
# последние 10
print(mapping[-10:])

[(-0.4224993170831814, '118'), (-0.34832699146909, 'хитрость'), (-0.2080963483102299, 'клясться'), (-0.20236828004726676, 'отличие'), (-0.20161653838168586, 'островок'), (-0.19148154901150666, 'дикобраз'), (-0.1897950350566461, 'недоделать'), (-0.18553428673686054, 'же'), (-0.18247915943042012, 'зомби'), (-0.178737250576839, 'покровительствовать')]



[(0.19146809260685735, 'место'), (0.1921136297530562, 'сравняться'), (0.19600552446892588, 'зацепить'), (0.19849772436260812, 'книга'), (0.19869962204899005, 'уголок'), (0.23752085260320338, 'контингент'), (0.250674542941112, 'смоделировать'), (0.26170925979211257, 'переработать'), (0.27682048890864147, 'предпринять'), (0.2798529812272922, 'добиться')]


In [111]:
weights = model.coef_[0]
vectorizer = CountVectorizer(analyzer="word")
features = vectorizer.fit_transform(train_neg_reviews).toarray()
vocab = vectorizer.get_feature_names()

mapping = zip(weights, vocab)

mapping = sorted(mapping, key=lambda tup: tup[0])
# первые 10
print(mapping[:10])
print('\n\n')
# последние 10
print(mapping[-10:])

[(-0.3749677564172916, 'утверждение'), (-0.27549871986642915, 'самый'), (-0.26698152287794186, 'отнимать'), (-0.25559594146182835, 'грязь'), (-0.24218162794789744, 'операторский'), (-0.23476466005635382, 'проехать'), (-0.21883591310344186, 'час'), (-0.21808043754516085, 'дожить'), (-0.2170701940962084, 'должность'), (-0.21426853857945297, 'пыльный')]



[(0.2062403297996023, 'месиво'), (0.20741159128253583, 'неоправданность'), (0.22177988999851694, 'достать'), (0.2232030335949821, 'безработица'), (0.2245712604381366, 'упускать'), (0.22949627235922954, 'идентифицироваться'), (0.23499702690077592, 'порушить'), (0.2374573888335256, 'столкновение'), (0.25303334450916926, 'среднее'), (0.45079345334770116, 'стандарт')]


In [112]:
weights = model.coef_[2]
vectorizer = CountVectorizer(analyzer="word")
features = vectorizer.fit_transform(train_neg_reviews).toarray()
vocab = vectorizer.get_feature_names()

mapping = zip(weights, vocab)

mapping = sorted(mapping, key=lambda tup: tup[0])
# первые 10
print(mapping[:10])
print('\n\n')
# последние 10
print(mapping[-10:])

[(-0.2697829046796834, 'кинопроектор'), (-0.21248175449060733, 'извращение'), (-0.21105825090106692, 'пересолить'), (-0.20967537709708334, 'смонтировать'), (-0.209253885085023, 'достать'), (-0.20594319839055508, 'пумбой'), (-0.20430878291690305, 'среднее'), (-0.1991172572862432, 'инстинкт'), (-0.19908487350225637, 'несущий'), (-0.17391922391537365, 'худоба')]



[(0.21840259240459217, 'пощекотать'), (0.22075889001697743, 'неинтересно'), (0.22449748099714384, 'вредить'), (0.22737091341303603, 'срывать'), (0.2351558343409769, 'феллини'), (0.23539112226210188, 'суперидея'), (0.3403832002961056, 'хэнск'), (0.366021662868077, 'отнимать'), (0.3691119450139682, 'операторский'), (0.5264154902127656, '100')]
