In [1]:
import pandas as pd

In [239]:
df = pd.read_csv("/Users/anton/PycharmProjects/afishaparser2/afisha_parser/copy_text_db.csv", index_col=0)

In [240]:
df.head(5)

Unnamed: 0,classId,raiting,text
0,222122,1,Феерический бред со спецэфектами.
1,222122,1,Не в коем случае не смотреть в кинотеатре))) с...
2,222122,5,"Офиегнный фильм, один из лучших за послежнее в..."
3,222122,1,Ужасно. Спали полфильма.... Для галочки сходил...
4,222122,1,Тут тома и джери не хватает .


In [241]:
data = df
df = df[df.raiting != 0]
df = df[pd.isnull(df.text) == False]

In [242]:
df.raiting.describe()

count    13483.000000
mean         4.030557
std          1.334874
min          1.000000
25%          3.000000
50%          5.000000
75%          5.000000
max          5.000000
Name: raiting, dtype: float64

In [243]:
df.raiting.value_counts()

5    7348
4    2704
1    1371
3    1297
2     763
Name: raiting, dtype: int64

In [244]:
df.raiting.value_counts()/df.raiting.count()

5    0.544983
4    0.200549
1    0.101684
3    0.096195
2    0.056590
Name: raiting, dtype: float64

In [285]:
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem.snowball import SnowballStemmer

TAG_RE = re.compile(r'<[^>]+>')
CHARACTER_RE = re.compile(r'[\r\n]')

def remove_tags(text):
    return TAG_RE.sub('', text)

def remove_symbols(text):
    try:
        return re.sub(r'[^а-яА-Яa-zA-Z]+', ' ', text).lower()  
    except TypeError as e:
        print (text)

def stemming(words):
    stemmer = SnowballStemmer("russian")
    return [stemmer.stem(word) for word in words]

def remove_stopwords(words):
    rus_stopwords = stopwords.words("russian")
    return list(filter(lambda x: x not in rus_stopwords, words)) 

def preprocessing(text):
    return " ".join(stemming(remove_stopwords(remove_tags(remove_symbols(text)).split())))

In [286]:
df.text = df.text.map(lambda text: preprocessing(text))

In [287]:
from sklearn.cross_validation import train_test_split
X = df.text
y = df.raiting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [288]:
from sklearn.metrics.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from stop_words import get_stop_words

In [326]:
from sklearn.feature_extraction.text import TfidfVectorizer
pipeline = Pipeline([
           ('vect', TfidfVectorizer(analyzer = "word", norm='l2', sublinear_tf=True)),
           ('clf', LogisticRegression())
       ])
parameters = {
           'vect__max_df': (0.25, 0.5),
           'vect__ngram_range': ((1, 1), (1, 2)),
           'vect__use_idf': (True, False),
           'clf__C': (0.1, 1, 10),
       }

grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')

In [328]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   42.8s
[Parallel(n_jobs=3)]: Done  72 out of  72 | elapsed:  1.2min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'vect__ngram_range': ((1, 1), (1, 2)), 'clf__C': (0.1, 1, 10), 'vect__max_df': (0.25, 0.5), 'vect__use_idf': (True, False)},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=1)

In [329]:
print(grid_search.best_score_)
best_parameters = grid_search.best_estimator_.get_params()
print(best_parameters)
for param_name in sorted(parameters.keys()):
        print('\t{0}: {1}'.format(param_name, best_parameters[param_name]))

0.608989261596
{'clf__warm_start': False, 'vect__binary': False, 'clf__fit_intercept': True, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__stop_words': None, 'vect__tokenizer': None, 'clf__n_jobs': 1, 'vect__min_df': 1, 'vect__dtype': <class 'numpy.int64'>, 'clf__max_iter': 100, 'clf__verbose': 0, 'vect__ngram_range': (1, 2), 'vect__sublinear_tf': True, 'clf__C': 10, 'vect__vocabulary': None, 'vect__input': 'content', 'clf__dual': False, 'clf__intercept_scaling': 1, 'vect__use_idf': True, 'clf__tol': 0.0001, 'clf__class_weight': None, 'clf': LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'vect__decode_error': 'strict', 'vect__smooth_idf': True, 'vect__analyzer': 'word', 'clf__multi_class': 'ovr', 'clf__random_state': None, 'steps': [('vec

In [330]:
predictions = grid_search.predict(X_test)
print('Accuracy:', accuracy_score(y_test, predictions))
print('Confusion Matrix:\n', confusion_matrix(y_test, predictions))
print('Classification Report:\n', classification_report(y_test, predictions))

Accuracy: 0.614606741573
Confusion Matrix:
 [[ 239    8   23   42  146]
 [  66   11   31   40  100]
 [  31    4   42  142  230]
 [  13    2   17  211  657]
 [  22    1    8  132 2232]]
Classification Report:
              precision    recall  f1-score   support

          1       0.64      0.52      0.58       458
          2       0.42      0.04      0.08       248
          3       0.35      0.09      0.15       449
          4       0.37      0.23      0.29       900
          5       0.66      0.93      0.78      2395

avg / total       0.56      0.61      0.55      4450



In [306]:
sentences = df.text.map(lambda sent: sent.split(" ")).tolist()

In [299]:
from gensim.models import word2vec

In [300]:
num_features = 300                    
min_word_count = 40                        
num_workers = 6
context = 10
downsampling = 1e-3

In [307]:
model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)

In [308]:
model.init_sims(replace=True)

In [309]:
model_name = "300features_40minwords_10context"
model.save(model_name)

In [334]:
model.syn0.shape

(1889, 300)

In [343]:
#model.index2word

In [347]:
np.zeros((num_features,),dtype="float32")

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [413]:
def makeFeatureVec(words, model, num_features):

    featureVec = np.zeros((num_features,),dtype="float32")
    nwords = 0.
    
    index2word_set = set(model.index2word)
    
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])

    if nwords != 0:
        featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0.
    
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    
    for review in reviews:
        if counter % 1000. == 0.:
            print ("Review {0} of {1}".format(counter, len(reviews)))
       
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        counter = counter + 1.
    return reviewFeatureVecs

In [414]:
train_review = X_train.map(lambda sent: sent.split(" ")).tolist()

In [415]:
test_review = X_test.map(lambda sent: sent.split(" ")).tolist()

In [416]:
train_data_vec = getAvgFeatureVecs( train_review, model, num_features )

Review 0.0 of 9033
Review 1000.0 of 9033
Review 2000.0 of 9033
Review 3000.0 of 9033
Review 4000.0 of 9033
Review 5000.0 of 9033
Review 6000.0 of 9033
Review 7000.0 of 9033
Review 8000.0 of 9033
Review 9000.0 of 9033




In [417]:
test_data_vec = getAvgFeatureVecs( test_review, model, num_features )

Review 0.0 of 4450
Review 1000.0 of 4450
Review 2000.0 of 4450
Review 3000.0 of 4450
Review 4000.0 of 4450




In [429]:

parameters = {
           'C': (0.1, 1, 10)
       }
word2vec_grid_search = GridSearchCV(clf, parameters, n_jobs=3, verbose=1, scoring='accuracy')

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [440]:
clf = LogisticRegression(C = 1000)
clf.fit(train_data_vec, y_train)
pred = clf.predict(test_data_vec)
accuracy_score(y_test, pred)

0.58314606741573038

In [444]:
print_result(y_test, pred)

Accuracy: 0.583146067416
Confusion Matrix:
 [[ 197    1   13   43  204]
 [  60    0   18   36  134]
 [  36    1   28   75  309]
 [  27    0   14  106  753]
 [  42    0   14   75 2264]]
Classification Report:
              precision    recall  f1-score   support

          1       0.54      0.43      0.48       458
          2       0.00      0.00      0.00       248
          3       0.32      0.06      0.10       449
          4       0.32      0.12      0.17       900
          5       0.62      0.95      0.75      2395

avg / total       0.49      0.58      0.50      4450



Accuracy: 0.583146067416
Confusion Matrix:
 [[ 197    1   13   43  204]
 [  60    0   18   36  134]
 [  36    1   28   75  309]
 [  27    0   14  106  753]
 [  42    0   14   75 2264]]
Classification Report:
              precision    recall  f1-score   support

          1       0.54      0.43      0.48       458
          2       0.00      0.00      0.00       248
          3       0.32      0.06      0.10       449
          4       0.32      0.12      0.17       900
          5       0.62      0.95      0.75      2395

avg / total       0.49      0.58      0.50      4450



In [442]:
def print_result(y_test, predictions):
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Confusion Matrix:\n', confusion_matrix(y_test, predictions))
    print('Classification Report:\n', classification_report(y_test, predictions))