In [1]:
from pandas import DataFrame, read_csv
from string_utils import morph_parse, make_tokens
from gensim.models import Word2Vec
from scipy import spatial
from numpy import zeros, add
from sklearn.metrics import f1_score
from warnings import filterwarnings
# NOTE(review): with no category/module filter this suppresses every warning
# for the rest of the session, including deprecation warnings raised by the
# libraries used below. Consider narrowing it once those are addressed.
filterwarnings('ignore')

In [2]:
# Read the annotated comment/reference pairs; the CSV is stored as cp1251.
df = read_csv('annotated.csv', encoding='cp1251')

# Pass both text columns through morph_parse and make the labels integer,
# replacing the columns in a single assignment.
df = df.assign(
    comment=df['comment'].apply(morph_parse),
    reference=df['reference'].apply(morph_parse),
    labels=df['labels'].astype(int),
)

In [3]:
# Class balance check: number of rows per label value.
df.loc[:, 'labels'].value_counts()

0    781
1    739
Name: labels, dtype: int64

In [4]:
# Load the pre-trained word2vec model from disk.
# NOTE(review): the file name suggests 100-dimensional vectors, which would
# match the num_features=100 default of relatedness() below -- confirm.
word2vec = Word2Vec.load('all_lem_100')

In [5]:
def _sum_word_vectors(text, num_features):
    """Return the element-wise sum of the word2vec vectors of the tokens in ``text``.

    ``text`` is lower-cased and tokenised with ``make_tokens`` against the
    model vocabulary. If no tokens come back, the result is the all-zero
    vector of length ``num_features``.
    """
    total = zeros(num_features, dtype='float32')
    for word in make_tokens(text.lower(), word2vec.wv.vocab):
        # Look the word up on the KeyedVectors (`.wv`), the same object the
        # vocabulary is taken from; indexing the Word2Vec model itself is a
        # deprecated shortcut for this.
        total = add(total, word2vec.wv[word])
    return total


def relatedness(message, reference, num_features=100):
    """Cosine similarity between the summed word vectors of two texts.

    Parameters
    ----------
    message, reference : str
        Texts to compare. Each is lower-cased and tokenised with
        ``make_tokens`` before its word vectors are summed.
    num_features : int, default 100
        Length of the accumulator vectors; must equal the dimensionality of
        the vectors stored in the global ``word2vec`` model.

    Returns
    -------
    float
        ``1 - cosine_distance`` of the two summed vectors, or ``0.0`` when
        either summed vector is all zeros (e.g. no token of that text was
        found), because the cosine is undefined (0/0) in that case.
    """
    message_vec = _sum_word_vectors(message, num_features)
    reference_vec = _sum_word_vectors(reference, num_features)

    # A zero vector has no direction; report "unrelated" explicitly instead
    # of letting the 0/0 inside the cosine computation leak out as NaN.
    if not message_vec.any() or not reference_vec.any():
        return 0.0

    return 1 - spatial.distance.cosine(message_vec, reference_vec)

In [6]:
# One relatedness score per row of df, stored by row position.
results = zeros(len(df), dtype='float32')

# Iterate positionally rather than using the iterrows() index label as an
# array offset: labels only coincide with positions for a default RangeIndex,
# so filtering or re-indexing df would otherwise write to the wrong slot or
# raise an IndexError.
for position, (comment, reference) in enumerate(zip(df['comment'], df['reference'])):
    results[position] = relatedness(comment, reference)

In [7]:
# Binarise the scores: 1 when the relatedness exceeds 0.2, otherwise 0.
results = [int(score > 0.2) for score in results]

In [8]:
# F1 of the thresholded predictions against the annotated labels.
f1_score(y_true=df['labels'], y_pred=results)

0.65059004617752703