In [18]:
from pandas import read_csv, Series
from gensim.models import Word2Vec
import numpy as np

In [19]:
from pymorphy2 import MorphAnalyzer
from nltk.tokenize import RegexpTokenizer
# Morphological analyser used to reduce each token to its normal form.
morph = MorphAnalyzer()

# Tokens start with a Latin or Cyrillic letter followed by at least one word
# character, so single-character tokens are never produced.
# The pattern is a raw string: '\w' in a plain string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
# NOTE(review): the range А-Яа-я does not contain Ё/ё, so those letters cannot
# start a token — confirm whether that is intended.
alpha_tokenizer = RegexpTokenizer(r'[A-Za-zА-Яа-я]\w+')

In [78]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [26]:
# Load the evaluation sentences into a single 'text' column, using the
# newline character as the field separator.
# NOTE(review): recent pandas releases may reject '\n' as a separator —
# confirm against the installed version.
df = read_csv('test_set.txt', sep='\n', names=['text'])

# Read one label per line. The context manager closes the file handle,
# which the bare open() call in a comprehension never did.
with open('test_set_y.txt') as labels_file:
    test_y = [line.split('\n')[0] for line in labels_file]

# Attach the labels positionally (same index as the sentences) as integers.
df['y'] = Series(test_y, index=df.index).astype(int)

In [27]:
# Load the pre-trained Word2Vec model from disk.
model = Word2Vec.load('2ch_model')

# Container used only for `word in vocab` membership tests below.
# gensim 4 removed KeyedVectors.vocab in favour of key_to_index, so prefer the
# new attribute when present and fall back to the old one otherwise.
vocab = model.wv.key_to_index if hasattr(model.wv, 'key_to_index') else model.wv.vocab

In [28]:
def model_process_words(words):
    """Turn raw text into the list of lemmas known to the model.

    The text is lower-cased and tokenized with ``alpha_tokenizer``; each
    token is replaced by the normal form of its first ``morph.parse``
    result, and only lemmas present in ``vocab`` are kept, in their
    original order.
    """
    lemmas = (
        morph.parse(token.lower())[0].normal_form
        for token in alpha_tokenizer.tokenize(words.lower())
    )
    return [lemma for lemma in lemmas if lemma in vocab]

In [53]:
# Make NumPy raise FloatingPointError on floating-point errors (division by
# zero, overflow, underflow, invalid operation) instead of emitting warnings.
# This changes NumPy's error handling for the rest of the session; the
# previous settings are returned by np.seterr and kept in old_err_state.
old_err_state = np.seterr(all='raise')

def makeFeatureVec(words, model, num_features, vocab):
    """Return the mean word vector of a text's in-vocabulary lemmas.

    Args:
        words: Raw text. It is lower-cased and passed through
            ``model_process_words`` to obtain the lemmas to average.
        model: Source of word vectors. Either an object exposing a ``wv``
            attribute that supports ``[word]`` lookup (such as a gensim
            ``Word2Vec`` model) or any object that itself supports
            ``[word]`` lookup.
        num_features: Length of the returned vector; it has to match the
            length of the vectors returned by the lookup.
        vocab: Container supporting ``in``; lemmas not found in it are
            skipped.

    Returns:
        A 1-D array of length ``num_features`` holding the average of the
        matched lemma vectors, or a float32 zero vector when no lemma
        matched.
    """
    # Indexing the Word2Vec object directly is deprecated and removed in
    # newer gensim releases; go through its `wv` attribute when it exists.
    vectors = getattr(model, 'wv', model)

    featureVec = np.zeros(num_features, dtype='float32')
    nwords = 0
    for word in model_process_words(words.lower()):
        if word in vocab:
            featureVec = np.add(featureVec, vectors[word])
            nwords += 1

    # Handle the "nothing matched" case explicitly instead of relying on the
    # global NumPy error state turning 0/0 into a FloatingPointError.
    if nwords == 0:
        return featureVec
    return np.divide(featureVec, nwords)

In [89]:
# One averaged 400-dimensional feature vector per sentence, in row order.
vec = [makeFeatureVec(text, model, 400, vocab) for text in df.text.values]

In [90]:
# Stack the per-sentence vectors into a feature matrix with matching labels.
X = np.array(vec)
Y = np.array(df.y.values)

# Positional split (no shuffling): the last 10 rows are held out for testing,
# everything before them is used for training.
X_train, X_test = X[:-10], X[-10:]
Y_train, Y_test = Y[:-10], Y[-10:]

In [91]:
# Sanity check: display the shape of the held-out feature matrix.
X_test.shape

(10, 400)

In [92]:
# Logistic regression on the averaged word vectors; fit() returns the
# estimator itself, so construction and training are chained.
clf = LogisticRegression(C=1).fit(X_train, Y_train)
result = clf.predict(X_test)

# Accuracy on the held-out rows (displayed as the cell output).
accuracy_score(Y_test, result)

0.90000000000000002

In [93]:
# Random forest on the same features. A fixed random_state makes the fitted
# forest, and therefore the reported accuracy and the predictions made with
# `clf` afterwards, reproducible across runs.
clf = RandomForestClassifier(n_estimators=150, random_state=0)
clf = clf.fit(X_train, Y_train)
result = clf.predict(X_test)

# Accuracy on the held-out rows (displayed as the cell output).
accuracy_score(Y_test, result)

0.90000000000000002

In [101]:
# Smoke test: classify a single hand-written sentence. predict() expects a
# 2-D input, so the feature vector is reshaped into a one-row matrix.
clf.predict(
    makeFeatureVec('питон + генсим = годнота!', model, 400, vocab).reshape(1, -1)
)

array([1])

In [108]:
# Smoke test: classify a second hand-written sentence, again passed to
# predict() as a one-row matrix.
clf.predict(
    makeFeatureVec('пошёл нахуй зелёный', model, 400, vocab).reshape(1, -1)
)

array([0])