In [2]:
import pandas as pd
import numpy as np
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import pickle

In [4]:
# Load the pickled object stored in ../hatexplain_data.pickle.
# NOTE(review): pickle.load can execute arbitrary code while deserializing —
# only open this file if it comes from a trusted source.
with open('../hatexplain_data.pickle', 'rb') as file:
    data = pickle.load(file)

In [6]:
# Pull the three document collections out of the loaded dict in one step.
documents, train_documents, test_documents = (
    data[key] for key in ("documents", "train_documents", "test_documents")
)

<h4>Format Curated Hate Speech Dataset</h4>

In [7]:
# Read the extra corpus used for embedding training; its "Content" column is
# tokenized in the next cell.
d2v_vocab = pd.read_csv("embedding_training.csv")

In [8]:
# Whitespace-tokenize every entry of the "Content" column into a list of tokens.
d2v_sentences = [text.split() for text in d2v_vocab["Content"]]

In [9]:
# Training corpus = tokenized CSV sentences followed by `documents` from the pickle.
# NOTE(review): `+` is list concatenation only if `documents` is a plain list — confirm.
d2v_all_sentences = d2v_sentences + documents

In [10]:
# Sanity check: number of sentences in the combined training corpus.
print(len(d2v_all_sentences))

745348


<h4>Doc2Vec Training</h4>

In [12]:
# Doc2Vec hyperparameters — the single source of truth for the training call below.
# window_size and min_count are set to 5 because those are the values the model
# was actually trained with: the call previously hard-coded window=5 and
# min_count=5 and silently ignored the constants declared here (2 and 3).
vector_size = 300
window_size = 5
min_count = 5
workers = 4
epochs = 5

# Tag each sentence with its position in the corpus (as a string).
tagged_data = [TaggedDocument(words=_d, tags=[str(i)]) for i, _d in enumerate(d2v_all_sentences)]

# Every hyperparameter is passed by name from the constants above, so editing a
# constant is guaranteed to change the model that gets trained.
d2v_model = Doc2Vec(
    tagged_data,
    vector_size=vector_size,
    window=window_size,
    min_count=min_count,
    workers=workers,
    epochs=epochs,
)


In [13]:
# Number of entries in the model's word-vector table (d2v_model.wv).
print(len(d2v_model.wv))

62770


In [14]:
# Spot-check the word vectors: the five nearest neighbours of the token 'a',
# printed as (word, similarity) pairs.
similar_words = d2v_model.wv.most_similar('a', topn=5)
print(similar_words)

[('this', 0.677055835723877), ('the', 0.6677677631378174), ('that', 0.6171519756317139), ('it', 0.6042236685752869), ('another', 0.574511706829071)]


<h4>Doc2Vec Embeddings</h4>

In [15]:
def create_doc_vector(documents, model):
    """Infer one vector per document and stack the results into one array.

    Args:
        documents: Iterable of documents; each item is passed unchanged to
            ``model.infer_vector``.
        model: Object exposing ``infer_vector(doc)``.

    Returns:
        ``np.ndarray`` built from the inferred vectors, in input order.
    """
    return np.array([model.infer_vector(tokens) for tokens in documents])

In [16]:
# Embed the train and test documents with the trained Doc2Vec model
# (one inferred vector per document).
X_train_d2v = create_doc_vector(train_documents, d2v_model)
X_test_d2v = create_doc_vector(test_documents, d2v_model)

In [17]:
# Bundle the train and test embedding matrices into a single dict.
data_to_save = dict(X_train_d2v=X_train_d2v, X_test_d2v=X_test_d2v)

# Serialize the bundle to disk with pickle.
with open('../saved_embeddings/d2v_embeddings.pickle', 'wb') as file:
    pickle.dump(data_to_save, file)

In [18]:
# Persist the trained Doc2Vec model to disk so it can be reloaded later
# without retraining.
d2v_model.save("../trained_embedding_models/my_d2v_model.model")