In [1]:
from gensim.models import KeyedVectors

# Load the pre-trained word vectors from disk in word2vec format.
# NOTE(review): the path ends in `.bin` but the file is read with
# binary=False (plain-text format) — confirm the file really is text.
model = KeyedVectors.load_word2vec_format('word2vec/vectors.bin', binary=False)

In [2]:
# Write the loaded vectors to 'word2vec.model' via the object's own save().
model.save('word2vec.model')
# Similarity query with 'anne' and 'kral' as positive terms and 'erkek' as
# the negative term; as the last expression, its result is what the cell shows.
model.most_similar(positive=['anne', 'kral'], negative=['erkek'])

[('krali', 0.6292586326599121),
 ('kralice', 0.6282593607902527),
 ('norodom', 0.6125580668449402),
 ('sihanouk', 0.5838896632194519),
 ('kralicesi', 0.5752500891685486),
 ('silmariene', 0.5721966028213501),
 ('edwardin', 0.5614503622055054),
 ('teoderik', 0.5602492094039917),
 ('prensi', 0.557105302810669),
 ('hirodes', 0.5560301542282104)]

In [13]:
from process_ds import *
import numpy as np
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def get_and_clean(dataset):
    """Load a labelled comment CSV and return cleaned comments with labels.

    Each comment has its ASCII punctuation replaced by spaces and is then
    passed through ``preprocess`` (not defined in this notebook; presumably
    supplied by the ``process_ds`` star import). Rows whose cleaned comment
    is empty/falsy are dropped together with their label.

    Parameters
    ----------
    dataset : str or path-like
        CSV file containing at least the columns ``comment`` and ``Label``.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The cleaned comments and the matching labels, equal in length.
    """
    df = pd.read_csv(
        dataset, usecols=['comment', 'Label'], encoding='unicode_escape')

    # The punctuation -> space table does not depend on the row, so build it
    # once instead of once per comment.
    translator = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation))

    def _clean(content):
        # An empty CSV cell is read as NaN (a float), which has no
        # .translate(); treat it as an empty comment so the row is dropped
        # instead of crashing the whole load.
        if not isinstance(content, str):
            return ''
        return preprocess(content.translate(translator))

    # Clean the whole column in one pass rather than writing cell by cell
    # into the frame while iterating over it.
    df['comment'] = [_clean(content) for content in df['comment']]

    # Keep only the rows that still have text after cleaning.
    keep = np.array([bool(content) for content in df['comment']], dtype=bool)
    df = df.loc[keep]
    return np.array(df['comment']), np.array(df['Label'])

In [14]:
# Read and clean the training CSV into parallel arrays of comments and labels.
X_train, y_train = get_and_clean('dataset/train.csv')
# Show both shapes so a length mismatch between texts and labels is visible.
X_train.shape, y_train.shape

((7991,), (7991,))

In [15]:
# Read and clean the test CSV with the same routine used for the training data.
X_test, y_test = get_and_clean('dataset/test.csv')
# Show both shapes so a length mismatch between texts and labels is visible.
X_test.shape, y_test.shape

((2665,), (2665,))

In [16]:
# Look up the vector stored for the token 'biri' to inspect one embedding.
model['biri']

array([ 1.329586,  0.502957, -1.970287, -0.780498,  0.719182, -1.566812,
        2.826463, -2.613646,  2.6055  ,  2.190009,  0.01035 , -1.141106,
        1.139917,  1.692133,  5.008232, -2.969589,  1.641621,  1.685976,
        1.296329,  3.856552,  0.542349,  0.862442,  1.865268, -2.799784,
       -2.092626,  3.27122 , -0.301047, -3.995167,  1.798686,  2.143213,
       -3.792131, -1.586872, -1.419663, -0.63752 ,  5.930891,  1.433477,
        4.245801, -0.761896, -4.553113, -2.136155,  0.828079, -3.103901,
       -1.347521, -1.637442,  2.763175,  0.976209, -1.601628,  2.946443,
       -0.342828,  0.122558, -5.228163,  2.380381,  2.421279, -0.268938,
       -1.324287, -0.817046,  2.177338, -1.676326,  2.14197 , -0.539436,
        3.881288,  0.595402,  1.408362, -2.283121, -5.496751, -0.061185,
       -0.103148,  2.258779, -0.134703, -3.748308, -0.289   , -3.733705,
       -2.39455 , -1.509682,  5.108567, -0.455409,  1.978857, -0.288489,
        0.428164, -1.433106, -2.658688, -3.45943 , 

In [17]:
def get_extrem(v, get='min'):
    """Return the element-wise minimum or maximum over a sequence of vectors.

    Parameters
    ----------
    v : sequence of array-like
        Non-empty sequence of vectors that ``np.minimum`` / ``np.maximum``
        can combine pairwise.
    get : {'min', 'max'}, default 'min'
        Which extreme to compute.

    Returns
    -------
    The running ``np.minimum`` (or ``np.maximum``) folded over every item
    of ``v``.

    Raises
    ------
    ValueError
        If ``get`` is neither 'min' nor 'max', or if ``v`` is empty.
    """
    reducers = {'min': np.minimum, 'max': np.maximum}
    # Reject unknown modes up front; previously this fell through and failed
    # later with an UnboundLocalError on the reducer name.
    if get not in reducers:
        raise ValueError(
            "get must be 'min' or 'max', got {!r}".format(get))
    # An empty sequence has no extreme; fail with a clear message instead of
    # an IndexError from v[0].
    if len(v) == 0:
        raise ValueError('cannot take the extreme of an empty sequence')

    func = reducers[get]
    extreme = v[0]
    for vi in v:
        extreme = func(extreme, vi)
    return extreme

def feature_vector(v):
    """Build one feature row from a sequence of vectors.

    The result is the element-wise minimum over ``v`` followed by the
    element-wise maximum over ``v``, joined into a single array.
    """
    lows = get_extrem(v, get='min')
    highs = get_extrem(v, get='max')
    return np.concatenate((lows, highs))

def to_feature_set(X):
    """Convert an iterable of whitespace-separated texts into a feature array.

    Every text is split on whitespace, each token is looked up in the
    module-level ``model``, and the token vectors are pooled into one row by
    ``feature_vector``. The rows are returned as a single numpy array.
    """
    rows = [
        feature_vector([model[token] for token in text.split()])
        for text in X
    ]
    return np.array(rows)

In [18]:
# Encode every training comment as one pooled (min + max) embedding row.
a = to_feature_set(X_train)
a.shape

(7991, 200)

In [19]:
# Encode the test comments with the same pooling used for the training set.
b = to_feature_set(X_test)
b.shape

(2665, 200)

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score


# Fit a Gaussian naive Bayes classifier on the pooled embedding features.
# It is bound to its own name so the word-vector `model` that
# to_feature_set() and the model[...] lookups read is not overwritten;
# rebinding `model` here made the earlier cells fail when re-run.
classifier = GaussianNB()
classifier.fit(a, y_train)
y_pred = classifier.predict(b)
# Report the micro-averaged F1 score on the test features.
print(f1_score(y_test, y_pred, average='micro'))

0.6517823639774859
