In [1]:
import pandas as pd
import numpy as np
from gensim.models import FastText
import pickle

In [2]:
# Read the pickled HateXplain bundle from the parent directory into `data`.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open('../hatexplain_data.pickle', 'rb') as pickle_in:
    data = pickle.load(pickle_in)

In [3]:
# Pull the full corpus and its train/test partitions out of the loaded bundle.
documents, train_documents, test_documents = (
    data[key] for key in ("documents", "train_documents", "test_documents")
)

<h4>Format Curated Hatespeech Dataset</h4>

In [4]:
# Curated corpus used to train the embedding model; read relative to the
# notebook's working directory.
ft_vocab = pd.read_csv(filepath_or_buffer="embedding_training.csv")

In [5]:
# Whitespace-tokenise every entry of the "Content" column: one token list per row.
ft_sentences = [
    text.split()
    for text in ft_vocab["Content"].tolist()
]

In [6]:
# Append `documents` (from the pickled bundle) to the curated-corpus sentences.
# NOTE(review): the FastText training cell is fed `ft_sentences`, not this
# combined list, so `ft_all_sentences` is not consumed by any visible cell —
# confirm which corpus the model is meant to be trained on.
ft_all_sentences = ft_sentences + documents

<h4>fastText Training</h4>

In [7]:
# FastText hyperparameters
vector_size = 300  # dimensionality of the learned word vectors
window_size = 3    # context window passed to FastText as `window`
min_count = 3      # passed as `min_count`; rarer tokens are dropped from the vocabulary
workers = 4        # number of training threads; adjust to the available CPU cores
epochs = 5         # passes over the training corpus

# Train the FastText model on the curated corpus.
# `workers` was previously defined but never forwarded, so gensim's default
# thread count was silently used instead of the value configured above.
# NOTE(review): training uses `ft_sentences` only; `ft_all_sentences` (which also
# contains `documents`) is built earlier but not used here — confirm intent.
ft_model = FastText(
    sentences=ft_sentences,
    vector_size=vector_size,
    window=window_size,
    min_count=min_count,
    workers=workers,
    epochs=epochs,
)

In [None]:
# Report how many entries the trained model's keyed vectors (`ft_model.wv`) hold.
print(len(ft_model.wv))

In [None]:
# Sanity check: ask the trained vectors for the 5 entries (topn=5) most similar
# to the literal token '<user>' and print the result.
similar_words = ft_model.wv.most_similar('<user>', topn=5)
print(similar_words)

In [8]:
def tokens_to_average_embedding(doc, model):
    """Return the mean word vector of a tokenised document.

    Parameters
    ----------
    doc : iterable of str
        The document's tokens.
    model : object
        Anything exposing ``model.wv[token]`` for vector lookup and
        ``model.wv.vector_size`` (e.g. a trained gensim ``FastText`` model).

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the token vectors. For a document with no
        tokens, a zero vector of length ``model.wv.vector_size``.
    """
    vectors = [model.wv[token] for token in doc]

    if not vectors:
        # np.mean of an empty list is a scalar NaN (plus a RuntimeWarning), which
        # would break the (n_docs, dim) matrix built from these rows. Return an
        # explicit zero vector instead; float32 is assumed to match the dtype of
        # the looked-up vectors so stacked rows are not upcast.
        return np.zeros(model.wv.vector_size, dtype=np.float32)

    return np.mean(vectors, axis=0)

<h4>fastText Embeddings</h4>

In [9]:
# Build one averaged FastText vector per document, for the train split and then
# the test split, and stack each split into its own array.
X_train_ft, X_test_ft = (
    np.array([tokens_to_average_embedding(tokens, ft_model) for tokens in split])
    for split in (train_documents, test_documents)
)

In [None]:
# Spot-check: print the first entry of the training embedding array.
print(X_train_ft[0])

In [11]:
# Bundle both embedding arrays under their variable names and pickle the bundle
# for downstream use.
data_to_save = dict(X_train_ft=X_train_ft, X_test_ft=X_test_ft)

with open('../saved_embeddings/fastText_embeddings.pickle', 'wb') as pickle_out:
    pickle.dump(data_to_save, pickle_out)

In [12]:
# Save the trained FastText model via its own `save` method.
# NOTE(review): assumes the ../trained_embedding_models/ directory already
# exists — TODO confirm.
ft_model.save("../trained_embedding_models/my_fastText_model.model")