In [1]:
from gensim.models import KeyedVectors

word2vec = KeyedVectors.load_word2vec_format('word2vec/vectors_400.bin', binary=False)

In [15]:
#word2vec.save('word2vec.model')
w2v_dim = 400
word2vec.most_similar(positive=['anne', 'kral'], negative=['erkek'])

[('krali', 0.46581345796585083),
 ('kralice', 0.46316829323768616),
 ('kralicesi', 0.4226829409599304),
 ('kralin', 0.4212174713611603),
 ('edwardin', 0.41212937235832214),
 ('richardin', 0.41065317392349243),
 ('krallari', 0.4024549126625061),
 ('henrynin', 0.39439308643341064),
 ('charlesin', 0.3926564157009125),
 ('kralinin', 0.3692318797111511)]

In [3]:
from process_ds import *
import numpy as np
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def get_and_clean(dataset):
    df = pd.read_csv(
        dataset, usecols=['comment', 'Label'], encoding='unicode_escape')

    drop = []
    for index, row in df.iterrows():
        content = row['comment']
        translator = str.maketrans(
            string.punctuation, ' '*len(string.punctuation))
        content = content.translate(translator)
        content = preprocess(content)
        df.loc[index, 'comment'] = content
        if not df.loc[index, 'comment']:
            drop.append(index)
    df = df.drop(index=drop)
    return np.array(df['comment']), np.array(df['Label'])

In [5]:
words_train, y_train = get_and_clean('dataset/train.csv')
words_train.shape, y_train.shape

((7991,), (7991,))

In [6]:
words_test, y_test = get_and_clean('dataset/test.csv')
words_test.shape, y_test.shape

((2665,), (2665,))

In [7]:
word2vec['biri']

array([ 3.228896, -0.147009, -1.931883, -1.020285, -2.318674,  0.210398,
       -2.645644,  1.263047, -0.38474 , -0.289636,  0.500422,  2.070905,
       -0.662991, -2.315848,  0.645493,  2.275743, -1.737326, -0.538942,
       -2.024382,  0.485932,  3.148136, -0.922693, -0.925424,  1.049409,
        0.449243, -1.0477  , -0.266813,  0.912633, -0.063486,  0.306128,
        0.579044, -0.750976, -1.502728, -0.536388,  0.782174, -1.448138,
       -1.695138,  3.203634, -3.157739, -1.638112, -0.105682,  1.814888,
        0.045683, -1.917891, -0.29747 , -0.281818,  0.219515, -1.420733,
        0.199821, -0.88874 , -0.432684,  2.162875, -0.999417,  0.418473,
       -0.699038, -0.873585,  0.810698, -0.856115, -2.023203,  1.560693,
       -1.435654, -0.576036, -0.29178 ,  2.598387, -0.776041, -2.541595,
        0.366221, -1.811885,  1.092941, -1.364043, -1.710791, -0.187191,
       -0.083885, -1.928778,  0.786471,  3.330691, -1.521363, -1.427381,
        4.178517,  0.407285,  1.873843, -0.26251 , 

In [8]:
def get_extrem(v, get='min'):
    vmin = v[0]
    if get == 'min':
        func = np.minimum
    elif get == 'max':
        func = np.maximum
    for vi in v:
        vmin = func(vmin, vi)
    return vmin

def feature_vector(v):
    return np.concatenate((get_extrem(v, get='min'), get_extrem(v, get='max')))

def min_max_transform(X, word2vec):
    new_X = []
    for x in X:
        v = [word2vec[w] for w in x.split()]
        new_X.append(feature_vector(v))
    return np.array(new_X)

In [9]:
def mean_transform(X, word2vec):
    new_X = []
    for x in X:
        v = [word2vec[w] for w in x.split()]
        w = np.zeros(w2v_dim)
        for vi in v:
            w += vi
        w /= len(vi)
        new_X.append(w)
    return np.array(new_X)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict 

def tfidf_transform(X, word2vec):
    tfidf = TfidfVectorizer()
    tfidf.fit(X)
    max_idf = max(tfidf.idf_)
    word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    new_X = []
    return np.array([
                np.mean([word2vec[w] * word2weight[w]
                         for w in words.split()], axis=0)
                for words in X
            ])

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

def benchmark(transform):
    X_train = transform(words_train, word2vec)
    X_test = transform(words_test, word2vec)
    print(X_train.shape, X_test.shape)
    model = GaussianNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('f1 score:', f1_score(y_test, y_pred, average='micro'))

In [12]:
benchmark(min_max_transform)

(7991, 800) (2665, 800)
f1 score: 0.6656660412757974


In [13]:
benchmark(mean_transform)

(7991, 400) (2665, 400)
f1 score: 0.6731707317073171


In [14]:
benchmark(tfidf_transform)

(7991, 400) (2665, 400)
f1 score: 0.7166979362101313
