#### Sentiment classification of Russian-language phone reviews (TF-IDF + logistic regression)

In [16]:
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV 
from sklearn.grid_search import RandomizedSearchCV 

from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer
import pymorphy2
import re

import sys

# Python 2 only: site.py removes sys.setdefaultencoding at interpreter start-up,
# so `sys` is reloaded to get the function back before switching the implicit
# str <-> unicode codec to UTF-8. Both statements must stay in this order.
# NOTE(review): `reload` is not a builtin on Python 3 - confirm the notebook targets Python 2.
reload(sys)
sys.setdefaultencoding('utf8')

In [17]:
# Read the training corpus and drop the 'Unnamed: 0' column straight away
# (presumably a row index written by an earlier to_csv call - TODO confirm).
corpus = pd.read_csv('./data/data_morph.csv').drop('Unnamed: 0', axis=1)

In [18]:
# Preview the first rows to check which columns were loaded.
corpus.head()

Unnamed: 0,text,label
0,камера металл корпус андроид отличный бюджетны...,1
1,дисплей присутствовать датчик ориентация в про...,1
2,пользоваться телефон 4 месяц не однократно лов...,0
3,отвратительный оболочка от samsung на android ...,0
4,1 очень медленный автоматический подстройка яр...,0


In [19]:
# Class balance: number of rows for each value of `label`.
corpus.label.value_counts()

1    2424
0    2352
Name: label, dtype: int64

In [64]:
# Ten stratified, shuffled folds; the fixed random_state keeps the split
# identical between runs.
cv = StratifiedKFold(corpus.label, n_folds=10, shuffle=True, random_state=1)

# TF-IDF features over word tokens feeding a logistic-regression classifier.
pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(analyzer='word')),
    ('classifier', LogisticRegression()),
])

# Exhaustive grid: 2 penalties x 2 stop-word settings x 4 n-gram ranges x 2 norms.
pipeline_params = dict(
    classifier__penalty=['l1', 'l2'],
    vectorizer__stop_words=[stopwords.words('russian'), None],
    vectorizer__ngram_range=[(1, 1), (1, 2), (1, 3), (1, 4)],
    vectorizer__norm=['l1', 'l2'],
)

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=pipeline_params,
    cv=cv,
    refit=True,   # refit the best setting on the whole corpus after the search
    verbose=1,
    n_jobs=4,
)
grid.fit(corpus.text.values.astype('U'), corpus.label)

# Pipeline refitted with the winning parameter combination.
best = grid.best_estimator_

print("Accuracy (TfidfVectorizer + LogisticRegression): {}, params {}".format(
    grid.best_score_, grid.best_params_
))

# Last expression of the cell: display the best cross-validated score.
grid.best_score_

0.9899497487437185

In [52]:
# Stand-alone vectoriser and classifier, fitted on the full corpus.
# NOTE(review): these hyper-parameters are hard-coded and not read from
# grid.best_params_ - confirm they match the grid-search winner.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    norm='l2',
    stop_words=stopwords.words('russian'),
)
X_vect = vectorizer.fit_transform(corpus.text.values.astype('U'))

classifier = LogisticRegression(penalty='l2')
classifier.fit(X_vect, corpus.label)
# fit() returns the estimator itself, so `fitted` is the same object as `classifier`.
fitted = classifier

In [53]:
# Punctuation that clean_str() replaces with a space before tokenising.
_CLEAN_STR_SYMBOLS = u',.-*#)(/<>:+?!"%=\\}'
# Code point -> space table for translate(): a single pass over the text
# instead of one replace() call per symbol.
_CLEAN_STR_TABLE = dict((ord(symbol), u' ') for symbol in _CLEAN_STR_SYMBOLS)
# Shared analyzer instance, created on first use by _get_morph_analyzer().
_MORPH_ANALYZER = None


def _get_morph_analyzer():
    """Return the shared pymorphy2.MorphAnalyzer, building it on first use."""
    global _MORPH_ANALYZER
    if _MORPH_ANALYZER is None:
        # Building an analyzer is expensive, so it is done once rather than
        # once per cleaned text.
        _MORPH_ANALYZER = pymorphy2.MorphAnalyzer()
    return _MORPH_ANALYZER


def _to_text(value):
    """Coerce ``value`` to a unicode string (byte strings are decoded as UTF-8)."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    text_type = type(u'')  # `unicode` on Python 2, `str` on Python 3
    return value if isinstance(value, text_type) else text_type(value)


def clean_str(string, morph=None):
    """Strip punctuation from a text and reduce every word to its normal form.

    Each character listed in ``_CLEAN_STR_SYMBOLS`` is replaced by a space, the
    text is split on whitespace, and every token is replaced by the
    ``normal_form`` of its first ``morph.parse()`` result.

    Parameters
    ----------
    string : object
        Text to clean. Non-string values (for example a NaN coming from an
        empty CSV cell) are converted to their string representation first.
    morph : object, optional
        Analyzer exposing ``parse(word)`` whose first result has a
        ``normal_form`` attribute. Defaults to a single shared
        ``pymorphy2.MorphAnalyzer`` that is created lazily and reused.

    Returns
    -------
    unicode string
        The normalised words joined by single spaces; empty if the input
        contains no tokens.
    """
    if morph is None:
        morph = _get_morph_analyzer()

    # Work on unicode text throughout, so the result does not depend on the
    # interpreter's default encoding; any Unicode whitespace separates tokens.
    words = _to_text(string).translate(_CLEAN_STR_TABLE).split()

    return u' '.join(morph.parse(word)[0].normal_form for word in words)

In [54]:
# Load the texts to classify and run each one through clean_str().
data_test = pd.read_csv('./data/test_data_to_predict.csv')
data_test['text'] = data_test['text'].map(clean_str)

In [55]:
# Cast the cleaned texts to a NumPy unicode array, the same form in which the
# training texts were passed to the vectoriser.
texts_to_predict = data_test.text.values.astype('U')

In [56]:
# Reuse the vectoriser already fitted on the training corpus (transform only,
# no re-fitting), then predict one label per test text.
X_test = vectorizer.transform(texts_to_predict)
predicts = fitted.predict(X_test)
# Last expression of the cell: display the raw predictions.
predicts

array([0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1])

In [57]:
# Empty frame; the 'Id' and 'y' columns are added to it below.
result = pd.DataFrame()

In [58]:
# One row per prediction. The ids are derived from the number of predictions
# instead of a hard-coded 100, so the cell still works when the test set
# changes size.
result['Id'] = np.arange(len(predicts))
result['y'] = predicts

In [59]:
# Preview the first rows of the result frame.
result.head()

Unnamed: 0,Id,y
0,0,0
1,1,1
2,2,0
3,3,0
4,4,1


In [60]:
# Translate numeric classes to text labels (0 -> 'neg', anything else -> 'pos').
# The labels are computed from `predicts` rather than from result['y'], so
# re-running this cell is harmless. Mapping result['y'] in place would turn
# every row into 'pos' on a second run, because the strings written by the
# first run never compare equal to 0.
result['y'] = np.where(np.asarray(predicts) == 0, 'neg', 'pos')

In [61]:
# Preview the frame again to check the 'neg'/'pos' labels in 'y'.
result.head()

Unnamed: 0,Id,y
0,0,neg
1,1,pos
2,2,neg
3,3,neg
4,4,pos


In [62]:
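# Disabled: both branches of this lambda return 'pos', so enabling the line
# would overwrite every prediction with 'pos'.
#result['y'] = result.y.apply(lambda x: 'pos' if x == 0 else 'pos')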

In [63]:
# Write the 'Id' and 'y' columns to disk; the DataFrame index is not saved.
result.to_csv('./data/result.csv', index=False)