In [1]:
from pandas import read_csv, Series
from gensim.models import Word2Vec
import numpy as np

In [2]:
from pymorphy2 import MorphAnalyzer
from nltk.tokenize import RegexpTokenizer
# pymorphy2 analyzer; created once because it is reused for every token.
morph = MorphAnalyzer()

# A token is one Latin or Cyrillic letter followed by at least one more word
# character, so single-character words are never produced.
# The pattern is a raw string so that `\w` reaches the regex engine as written;
# in a plain literal it is an invalid escape sequence that Python warns about.
# NOTE(review): the range А-Яа-я does not include Ё/ё, so a word that starts
# with that letter may be matched from its second character - confirm intent.
alpha_tokenizer = RegexpTokenizer(r'[A-Za-zА-Яа-я]\w+')

In [20]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [4]:
# Labelled messages; the CSV carries a column literally named 'index' that is
# not needed afterwards, so it is dropped right after loading.
df = read_csv('2ch_test_set.csv').drop('index', axis=1)

In [5]:
# Pre-trained word2vec model, loaded from a file in the working directory.
model = Word2Vec.load('2ch_model')

# `vocab` is only used for `token in vocab` membership checks.
# gensim >= 4 exposes the token mapping as `wv.key_to_index` and raises
# AttributeError on `wv.vocab`; older releases only have `wv.vocab`.
vocab = getattr(model.wv, 'key_to_index', None)
if vocab is None:
    vocab = model.wv.vocab

In [6]:
def pre_process(sentence):
    """Turn a raw message into the list of lemmas known to the model.

    The text is lower-cased, split with the module-level `alpha_tokenizer`,
    each token is replaced by the `normal_form` of its first `morph.parse`
    result, and lemmas that are not in the module-level `vocab` are dropped.

    Args:
        sentence: message text (must provide `.lower()`).

    Returns:
        List of lemmas, in message order, that are present in `vocab`.
        Empty when nothing survives the filter.
    """
    # Lower-casing once is enough: every token is a slice of this string.
    tokens = alpha_tokenizer.tokenize(sentence.lower())
    # Only the first parse result of each token is used.
    lemmas = (morph.parse(token)[0].normal_form for token in tokens)
    return [lemma for lemma in lemmas if lemma in vocab]

In [7]:
# Make NumPy raise FloatingPointError for every floating-point error category
# instead of emitting a warning. This is a global setting: it applies to all
# NumPy operations executed after this line, not only to the code in this cell.
# The value returned by np.seterr is kept in old_err_state.
old_err_state = np.seterr(all='raise')

def vectorize_message(sentence, model, num_features, vocab):
    """Average the word vectors of a message into one feature vector.

    Args:
        sentence: raw message text; it is tokenized and lemmatized by
            `pre_process`, which also lower-cases it.
        model: source of word vectors. If it has a `wv` attribute, vectors
            are read from `model.wv[word]`; otherwise from `model[word]`.
        num_features: length of the returned vector; expected to equal the
            length of the vectors stored in `model`.
        vocab: container supporting `in`; only tokens found in it are used.

    Returns:
        1-D array of length `num_features`: the element-wise mean of the
        vectors of all contributing tokens, or all zeros (float32) when no
        token contributes.
    """
    # Subscripting a Word2Vec model directly is deprecated in gensim 3 and
    # unsupported in gensim 4; the keyed vectors under `.wv` work in both.
    vectors = getattr(model, 'wv', model)

    featureVec = np.zeros((num_features,), dtype='float32')
    nwords = 0
    for word in pre_process(sentence):
        if word in vocab:
            featureVec = np.add(featureVec, vectors[word])
            nwords += 1

    # Handle the empty case explicitly instead of relying on the global
    # NumPy error state to turn 0/0 into a FloatingPointError.
    if nwords == 0:
        return featureVec

    # An underflow while averaging is harmless; do not let a global
    # np.seterr(all='raise') turn it into an exception.
    with np.errstate(under='ignore'):
        return np.divide(featureVec, nwords)

In [8]:
# One averaged word vector (num_features=400) per message, in the same order
# as the rows of df.
vec = [vectorize_message(text, model, 400, vocab) for text in df.text.values]

In [9]:
CHUNK = 40

# NOTE(review): `~CHUNK` is bitwise NOT, i.e. -(CHUNK + 1), so the hold-out set
# is the last CHUNK + 1 rows, not the last CHUNK rows.
holdout_start = ~CHUNK

X = np.array(vec)
Y = np.array(df['is_relevent'].values)

# Ordered split without shuffling: everything before the boundary is used for
# training, the tail is used for evaluation.
X_train, X_test = X[:holdout_start], X[holdout_start:]
Y_train, Y_test = Y[:holdout_start], Y[holdout_start:]

In [10]:
# Sanity check: (rows, features) of the hold-out feature matrix.
X_test.shape

(41, 400)

In [11]:
# Logistic regression with C=1, fitted on the training rows and scored by
# accuracy on the hold-out rows.
clf = LogisticRegression(C=1).fit(X_train, Y_train)
result = clf.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=result)

0.75609756097560976

In [15]:
# Random forest with 150 trees, fitted on the training rows and scored by
# accuracy on the hold-out rows.
# The forest is randomized; a fixed random_state makes the score repeatable
# across kernel restarts. A score recorded from an unseeded run may differ.
clf = RandomForestClassifier(n_estimators=150, random_state=42)
clf = clf.fit(X_train, Y_train)
result = clf.predict(X_test)
accuracy_score(Y_test, result)

0.80487804878048785

In [29]:
# Linear-kernel SVM with C=0.01, fitted on the training rows and scored by
# accuracy on the hold-out rows.
clf = SVC(kernel='linear', C=0.01).fit(X_train, Y_train)
result = clf.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=result)

0.75609756097560976

In [16]:
# Classify one ad-hoc message. predict() is given a single-row 2-D array,
# hence the reshape.
# NOTE(review): `clf` is whichever classifier was fitted last in this kernel;
# the cell execution counts are not in top-to-bottom order, so confirm which
# model this prediction is meant to come from.
features = vectorize_message('питон + генсим = годнота!', model, 400, vocab)
clf.predict(features.reshape(1, -1))

array([1])

In [19]:
# Classify a second ad-hoc message with the classifier currently bound to
# `clf`. predict() is given a single-row 2-D array, hence the reshape.
features = vectorize_message('пошёл нахуй зелёный веб макакер', model, 400, vocab)
clf.predict(features.reshape(1, -1))

array([0])