Импортируем необходимые модули

In [1]:
import pymorphy2
import pandas as pd
import xml.etree.ElementTree
import re
import requests
import numpy as np
import pickle
from pprint import pprint
from time import time

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, RidgeClassifierCV 
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline


In [2]:
def text_classifier(vectorizer, transformer, classifier):
    return Pipeline(
            [("vectorizer", vectorizer),
            ("transformer", transformer),
            ("classifier", classifier)]
    )

Создадим из тестовых данных xml

In [3]:
f = open("sss.xml",'w')
f.write('<root>\n')
f2 = open('test.csv','r')
lines = f2.readlines()
for line in lines:
    f.write(line)
f2.close()
f.write('</root>')
f.close()

In [4]:
e = xml.etree.ElementTree.parse('sss.xml')

In [5]:
reviews_raw = e.findall('review')
test_reviews = np.array([x.text for x in reviews_raw])

Загружаем честно украденные у Яндекса данные

In [6]:
texts = pickle.load(open('texts.pkl','rb'))
labels = pickle.load(open('labels.pkl','rb'))

Подбираем более менее пристойную модель

In [7]:
for vctr in [CountVectorizer, TfidfVectorizer]:
    for trfr in [TfidfTransformer]:
        for clfr in [LogisticRegression, SGDClassifier, RidgeClassifier, RidgeClassifierCV]:
            print(vctr)
            print(trfr)
            print(clfr)
            print(cross_val_score(text_classifier(vctr(), trfr(), clfr()), texts, labels).mean())
            print("\n")

<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.feature_extraction.text.TfidfTransformer'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>
0.791111111111


<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.feature_extraction.text.TfidfTransformer'>
<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>
0.784444444444


<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.feature_extraction.text.TfidfTransformer'>
<class 'sklearn.linear_model.ridge.RidgeClassifier'>
0.795555555556


<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.feature_extraction.text.TfidfTransformer'>
<class 'sklearn.linear_model.ridge.RidgeClassifierCV'>
0.785555555556


<class 'sklearn.feature_extraction.text.TfidfVectorizer'>
<class 'sklearn.feature_extraction.text.TfidfTransformer'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>
0.795555555556


<class 'sklearn.feature_extraction.text

Попробуем подобрать параметры для наиболее удачной модели

In [8]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RidgeClassifierCV()),
])

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 4000, 6000, 8000),
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__cv': (None, 3),
    'clf__normalize': (True, False),
}

In [9]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(texts, labels)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
('pipeline:', ['vect', 'tfidf', 'clf'])
parameters:
{'clf__cv': (None, 3),
 'clf__normalize': (True, False),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__max_features': (None, 4000, 6000, 8000),
 'vect__ngram_range': ((1, 1), (1, 2), (1, 3))}
Fitting 3 folds for each of 576 candidates, totalling 1728 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 1728 out of 1728 | elapsed: 17.3min finished


done in 1042.771s
()
Best score: 0.834
Best parameters set:
	clf__cv: None
	clf__normalize: False
	tfidf__norm: 'l1'
	tfidf__use_idf: True
	vect__max_df: 0.75
	vect__max_features: None
	vect__ngram_range: (1, 3)


Обучаем модель

In [10]:
model = text_classifier(
    TfidfVectorizer(max_df=0.75, ngram_range=(1, 3)),
    TfidfTransformer(norm='l1'),
    RidgeClassifierCV()
)
model.fit(texts,labels)

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.75, max_features=None, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_i....0, 10.0), class_weight=None, cv=None,
         fit_intercept=True, normalize=False, scoring=None))])

Трём магический шар и получаем предсказание

In [11]:
model.predict(test_reviews)

array([ 0.,  1.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,
        1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  0.,
        1.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,  1.,
        0.,  0.,  1.,  1.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,
        0.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,
        1.,  0.,  1.,  0.,  1.,  1.,  1.,  1.,  0.])

Записываем в файл

In [12]:
t = []
for x in model.predict(test_reviews):
    if x > 0:
        t.append('pos')
    else:
        t.append('neg')
        
result = pd.DataFrame(data = t, columns=['y'])

result.to_csv('result.csv',index_label=['Id'])

kaggle 0.82 So sad

Попробуем лемматизировать наши тексты

In [14]:
def norm_word(word):
    morph = pymorphy2.MorphAnalyzer()
    p = morph.parse(word)[0]
    return p.normal_form

In [15]:
def norm_list(lst):
    words_list = [re.sub('\W', ' ', x, flags=re.UNICODE).split() for x in lst]
    reviews_norm = []
    for words in words_list:
        words_norm_list = []
        for word in words:
            nw = norm_word(word)
            words_norm_list.append(nw)
        reviews_norm.append(words_norm_list)
    result = [(' ').join(x) for x in reviews_norm]
    return result


In [16]:
%%time
test_reviews_norm = norm_list(test_reviews)

Wall time: 18min 16s


In [17]:
print(test_reviews_norm[0])

ужасно слабый аккумулятор это основной минус это аппарат разряжаться буквально за пара часы при включить wifi и на макс подсветка например если играть или смотреть видео следовательно использовать можно только если есть постоянный возможность подзарядиться качество звук через динамик далеко не на высота наблюдаться незначительный тормоз в некоторый приложение и вообще в меню очень мало встроить память а приложение устанавливаться именно туда с это связанный неудобство нужно постоянно переносить они на карта память несколько неудобно что нету отдельный кнопка для фото подумывать купить батарея больший ёмкость мб что нибыть измениться


выглядит неплохо, попробуем лемматизировать отзывы с Яндекса.
Интересно, сколько это займёт времени?

In [18]:
%%time
texts_norm = norm_list(texts)

Wall time: 15h 41min 15s


шестнадцать часов, ёлы-палы

подыщем модель посимпатичней

In [19]:
for vctr in [CountVectorizer, TfidfVectorizer]:
    for trfr in [TfidfTransformer]:
        for clfr in [LogisticRegression, SGDClassifier, RidgeClassifier, RidgeClassifierCV]:
            print(vctr)
            print(trfr)
            print(clfr)
            print(cross_val_score(text_classifier(vctr(), trfr(), clfr()), texts_norm, labels).mean())
            print("\n")

<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.feature_extraction.text.TfidfTransformer'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>
0.798888888889


<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.feature_extraction.text.TfidfTransformer'>
<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>
0.79


<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.feature_extraction.text.TfidfTransformer'>
<class 'sklearn.linear_model.ridge.RidgeClassifier'>
0.801111111111


<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.feature_extraction.text.TfidfTransformer'>
<class 'sklearn.linear_model.ridge.RidgeClassifierCV'>
0.795555555556


<class 'sklearn.feature_extraction.text.TfidfVectorizer'>
<class 'sklearn.feature_extraction.text.TfidfTransformer'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>
0.814444444444


<class 'sklearn.feature_extraction.text.TfidfVect

In [20]:
pickle.dump(texts_norm, open("texts_norm.pkl","wb"))
pickle.dump(test_reviews_norm, open("test_norm.pkl","wb"))

на всякий случай сохранили наши лемматизированные отзывы

а теперь подбираем параметры лучшей модели

In [21]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RidgeClassifier()),
])

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 4000, 6000, 8000),
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__normalize': (True, False),
}

In [22]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(texts_norm, labels)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
('pipeline:', ['vect', 'tfidf', 'clf'])
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__normalize': (True, False),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__max_features': (None, 4000, 6000, 8000),
 'vect__ngram_range': ((1, 1), (1, 2), (1, 3))}
Fitting 3 folds for each of 576 candidates, totalling 1728 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 1728 out of 1728 | elapsed: 15.2min finished


done in 910.601s
()
Best score: 0.840
Best parameters set:
	clf__alpha: 1e-05
	clf__normalize: False
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 1.0
	vect__max_features: None
	vect__ngram_range: (1, 2)


In [23]:
model_norm = text_classifier(
    TfidfVectorizer(ngram_range=(1, 2)),
    TfidfTransformer(),
    RidgeClassifier(alpha=1e-05))

In [24]:
model_norm.fit(texts_norm, labels)

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_id...tercept=True, max_iter=None, normalize=False,
        random_state=None, solver='auto', tol=0.001))])

In [25]:
res_norm = model_norm.predict(test_reviews_norm)

In [26]:
t = []
for x in res_norm:
    if x > 0:
        t.append('pos')
    else:
        t.append('neg')
        
result = pd.DataFrame(data = t, columns=['y'])

result.to_csv('result_norm.csv',index_label=['Id'])

Результат на kaggle 0.9

Совсем другое дело