In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import pickle

In [2]:
# Load the pre-processed dataset bundle from disk.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('../hatexplain_data.pickle', 'rb') as pickle_in:
    data = pickle.load(pickle_in)

In [3]:
# Pull the document collections out of the loaded bundle.
documents, train_documents, test_documents = (
    data["documents"],
    data["train_documents"],
    data["test_documents"],
)

# Same three collections for the binary variant of the task.
documents_binary, train_documents_binary, test_documents_binary = (
    data["documents_binary"],
    data["train_documents_binary"],
    data["test_documents_binary"],
)

<h4>Load and Tokenize the Curated Hate Speech Dataset</h4>

In [4]:
# Extra text used only for training the embedding model (path is relative to this notebook).
embedding_csv_path = "embedding_training.csv"
w2v_vocab = pd.read_csv(embedding_csv_path)

In [5]:
# Whitespace-tokenise every entry of the "Content" column into a list of tokens.
content_texts = w2v_vocab["Content"].tolist()
w2v_sentences = [text.split() for text in content_texts]

In [6]:
# Combine the tokenised CSV sentences with the documents loaded from the pickle
# so the embedding model is trained on both sources.
# NOTE(review): assumes `documents` is a list of token lists like `w2v_sentences` - confirm.
w2v_all_sentences = w2v_sentences + documents

<h4>Word2Vec Training</h4>

In [7]:
# Word2Vec hyperparameters (names mirror the gensim keyword arguments they feed).
vector_size = 300  # dimensionality of each learned word vector
window_size = 2    # passed as `window`
min_count = 3      # passed as `min_count`
workers = 4        # passed as `workers`
epochs = 5         # passed as `epochs`

w2v_training_kwargs = {
    "vector_size": vector_size,
    "window": window_size,
    "min_count": min_count,
    "workers": workers,
    "epochs": epochs,
}

# Constructing the model with `sentences` supplied trains it immediately.
# NOTE(review): gensim documents that runs with more than one worker are not
# bit-for-bit reproducible - confirm whether that matters for this analysis.
w2v_model = Word2Vec(sentences=w2v_all_sentences, **w2v_training_kwargs)


In [8]:
# print(len(w2v_model.wv))

In [9]:
# similar_words = w2v_model.wv.most_similar('<number>', topn=10)
# print(similar_words)

In [10]:
def tokens_to_average_embedding(doc, model):
    """Return the mean word vector of the in-vocabulary tokens of ``doc``.

    Parameters
    ----------
    doc : iterable of str
        Tokens of a single document.
    model : object
        Trained embedding model. It must expose ``model.wv`` (supporting
        ``word in model.wv`` and ``model.wv[word]``) and ``model.vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``. Out-of-vocabulary tokens are
        skipped, so they do not dilute the average. If the document has no
        in-vocabulary token (including an empty document), a float32 zero
        vector is returned.
    """
    word_vectors = model.wv
    embeddings = [word_vectors[word] for word in doc if word in word_vectors]

    if not embeddings:
        # gensim stores word vectors as float32. Keeping the fallback in the same
        # dtype stops a single empty/all-OOV document from upcasting the whole
        # stacked feature matrix to float64.
        return np.zeros(model.vector_size, dtype=np.float32)

    return np.mean(embeddings, axis=0)

<h4>W2V Embeddings</h4>

In [11]:
# Word2Vec document embeddings: one averaged vector per document.
def _embed_documents(docs):
    """Stack the averaged Word2Vec embedding of every document in ``docs``."""
    return np.array([tokens_to_average_embedding(tokens, w2v_model) for tokens in docs])


X_train_w2v = _embed_documents(train_documents)
X_test_w2v = _embed_documents(test_documents)

X_train_w2v_binary = _embed_documents(train_documents_binary)
X_test_w2v_binary = _embed_documents(test_documents_binary)

In [12]:
# Sanity check: show the dimensions of the binary-task training matrix.
binary_train_shape = X_train_w2v_binary.shape
print(binary_train_shape)

(10683, 300)


In [13]:
# Bundle the four embedding matrices into one dict, keyed by their variable names.
data_to_save = dict(
    zip(
        ("X_train_w2v", "X_test_w2v", "X_train_w2v_binary", "X_test_w2v_binary"),
        (X_train_w2v, X_test_w2v, X_train_w2v_binary, X_test_w2v_binary),
    )
)

# Write the bundle to disk as a single pickle file.
with open('../saved_embeddings/w2v_embeddings.pickle', 'wb') as pickle_out:
    pickle.dump(data_to_save, pickle_out)

In [14]:
# Save the trained Word2Vec model to disk.
w2v_model_path = "../trained_embedding_models/my_w2v_model.model"
w2v_model.save(w2v_model_path)