In [1]:
import nltk
nltk.download('brown')
from nltk.corpus import brown
import string
from gensim.models import Word2Vec
import gensim

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [2]:
def preprocess_text(sentence):
    preprocessed_sentence = []
    for word in sentence:
        word = word.lower()
        word = word.translate(str.maketrans('', '', string.punctuation))
        preprocessed_sentence.append(word)
    return preprocessed_sentence

corpus = [preprocess_text(sentence) for sentence in brown.sents()]


In [3]:
# Define hyperparameters
window_sizes = [2, 5, 10]
dimensions = [25, 100, 150]
num_negative_samples = [1, 10]

# Loop over hyperparameters
for window in window_sizes:
    for dim in dimensions:
        for neg_samples in num_negative_samples:
            # Train Word2Vec model
            model = Word2Vec(sentences=corpus, vector_size=dim, window=window, negative=neg_samples, sg=1)

            # Save the trained model
            model_name = f"word2vec_model_window_{window}_dim_{dim}_neg_{neg_samples}.bin"
            model.save(model_name)

In [4]:
import numpy as np
import numpy.linalg as LA
import numpy as np
import os
import random

#BATS Evaluation function
def eval_bats_file(model, matrix, vocab, indices, f):
    pairs = [line.strip().split() for line in open(f, 'r').readlines()]
    pairs = [[p[0], p[1].split('/')] for p in pairs if p[0] in model.wv]
    pairs = [[p[0], [w for w in p[1] if w in model.wv]] for p in pairs]
    pairs = [p for p in pairs if len(p[1]) > 0]
    if len(pairs) <= 1:
        return None
    transposed = np.transpose(np.array([x / LA.norm(x) for x in matrix]))
    qa = []
    qb = []
    qc = []
    targets = []
    exclude = []
    groups = []
    for i in range(len(pairs)):
        j = random.randint(0, len(pairs) - 2)
        if j >= i: j += 1
        a = model.wv[pairs[i][0]]
        c = model.wv[pairs[j][0]]
        for bw in pairs[i][1]:
            qa.append(a)
            qb.append(model.wv[bw])
            qc.append(c)
            groups.append(i)
            targets.append(pairs[j][1])
            exclude.append([pairs[i][0], bw, pairs[j][0]])

    for queries in [qa, qb, qc]:
        queries = np.array([x / LA.norm(x) for x in queries])
    sa = np.matmul(qa, transposed) + .0001
    sb = np.matmul(qb, transposed)
    sc = np.matmul(qc, transposed)
    sims = sb + sc - sa
    for i in range(len(exclude)):
        for w in exclude[i]:
            sims[i][indices[w]] = 0
    preds = [vocab[np.argmax(x)] for x in sims]
    accs = [1 if preds[i].lower() in targets[i] else 0 for i in range(len(preds))]
    regrouped = np.zeros(np.max(groups) + 1)
    for a, g in zip(accs, groups):
        regrouped[g] = max(a, regrouped[g])
    return np.mean(regrouped)

In [5]:
import numpy as np
import os

# Loop over hyperparameters
for window in window_sizes:
    for dim in dimensions:
        for neg_samples in num_negative_samples:

            # Load Learned Model
            dim = dim  # dimension of vector
            win = window  # windows size
            ns = neg_samples  # Number of Negative Sample
            model_name = f"word2vec_model_window_{window}_dim_{dim}_neg_{neg_samples}.bin"
            new_model = gensim.models.Word2Vec.load(model_name)


            # Evaluate wordsim353
            model_evalu_wordsim353 = new_model.wv.evaluate_word_pairs('/content/wordsim353.tsv')[0][0] * 100
            model_evalu_win353 = new_model.wv.evaluate_word_pairs('/content/win353.tsv')[0][0] * 100

            # Extract Vocab and Matrices from model
            indices = {}
            vocab = [k for k in model.wv.key_to_index.keys()]
            for i in range(len(vocab)):
                indices[vocab[i]] = i

            matrix = []
            for w in vocab:
                matrix.append(model.wv[w])
            matrix = np.array(matrix)

            accs = {}
            base = '/content/BATS'
            for dr in os.listdir(base):
                dr_path = os.path.join(base, dr)
                if os.path.isdir(dr_path):
                    dk = dr.split('_', 1)[1].lower()
                    accs[dk] = []
                    for f in os.listdir(dr_path):
                        file_path = os.path.join(dr_path, f)
                        if os.path.isfile(file_path):  # Make sure to only open files
                            accs[f.split('.')[0]] = eval_bats_file(model, matrix, vocab, indices, file_path)
                            accs[dk].append(accs[f.split('.')[0]])
                    accs[dk] = [a for a in accs[dk] if a is not None]
                    accs[dk] = np.mean(accs[dk]) if len(accs[dk]) > 0 else None

            accs['total'] = np.mean([accs[k] for k in accs.keys() if accs[k] is not None])

            # Create the results list
            # Create the results list
            results = ['Word2Vec', win, dim, ns, model_evalu_wordsim353, model_evalu_win353]
            for k in accs.keys():
                results.append(accs[k])

            # گرد کردن اعداد به دو رقم اعشار
            results_rounded = [round(item, 16) if isinstance(item, float) else item for item in results]

            print(results_rounded)




['Word2Vec', 2, 25, 1, 1.6835928423588067, -1.4797754224759838, 0.0208603131174951, 0.0465116279069767, 0.0, 0.0444444444444444, 0.0, 0.0, 0.1176470588235294, 0.0, 0.0, 0.0, 0.0, 0.0270070683191251, None, 0.0, 0.025, 0.0222222222222222, 0.0851063829787234, 0.0681818181818182, 0.0, 0.0, 0.0, 0.0425531914893617, 0.0, 0.0, 0.0, None, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0346378767431399, 0.0, 0.0, None, 0.0, 0.0, 0.0, 0.0, 0.0526315789473684, 0.1052631578947368, 0.1538461538461538, 0.0206320218271974]
['Word2Vec', 2, 25, 10, 17.2954113493433, 14.487472602007367, 0.0204514363885089, 0.1162790697674418, 0.0, 0.0, 0.0, 0.0, 0.0882352941176471, 0.0, 0.0, 0.0, 0.0, 0.0268648328354744, None, 0.0, 0.025, 0.0222222222222222, 0.0638297872340426, 0.0, 0.0, 0.0416666666666667, 0.0465116279069767, 0.0425531914893617, 0.0109395109395109, 0.0, 0.0, None, 0.0, 0.027027027027027, 0.0, 0.0, 0.0714285714285714, 0.0, 0.0, 0.0715420057525321, 0.0, 0.0, None, 0.0, 0.0, 0.0909090909090909, 0.0833333333333333, 