In [1]:
from datasets import load_dataset
import re

from gensim.models import FastText

import logging

In [2]:
# Load the raw WikiText-2 configuration of the Salesforce/wikitext dataset.
# To use the WikiText-103 variant instead, pass "wikitext-103-raw-v1" as the config name.
dataset = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1")

# Pull the 'text' column out of each split.
train, valid, test = (dataset[split_name]['text'] for split_name in ('train', 'validation', 'test'))

In [3]:

# Step 2: Tokenize the text using regular expressions
def tokenize_with_re(data):
    """Lower-case each non-blank string in `data` and split it into alphabetic tokens.

    Args:
        data: Iterable of strings (one text entry each).

    Returns:
        A list with one token list per non-blank entry, in input order.
        Entries that are empty or whitespace-only are skipped. An entry that
        has content but no matching token still contributes an empty list.
    """
    # A token is a run of ASCII letters delimited by word boundaries on both
    # sides, so runs glued to digits or underscores (e.g. "abc123") do not match.
    word_pattern = re.compile(r'\b[a-zA-Z]+\b')

    tokenized_sentences = []
    for sentence in data:
        if not sentence.strip():
            continue  # blank entry: nothing to tokenize
        tokenized_sentences.append(word_pattern.findall(sentence.lower()))
    return tokenized_sentences
    

# Tokenize the train, validation and test splits with the same tokenizer.
train_tokenized, valid_tokenized, test_tokenized = (
    tokenize_with_re(split_text) for split_text in (train, valid, test)
)

# Step 3: assemble the training corpus for the embedding model.
# All three splits are merged, so validation and test text is part of the corpus too.
all_tokenized_data = [*train_tokenized, *valid_tokenized, *test_tokenized]

# Token count of the longest entry in the merged corpus.
max_sentence_len = max(map(len, all_tokenized_data))
print(f"Max sentence length: {max_sentence_len}")


Max sentence length: 612


In [4]:
# Enable INFO-level logging so gensim's vocabulary/training progress messages are shown.
logging.basicConfig(level=logging.INFO, format="%(asctime)s : %(levelname)s : %(message)s")

# Build the vocabulary and train FastText embeddings; supplying `sentences` to the
# constructor starts training right away.
model = FastText(
    sentences=all_tokenized_data,  # corpus: one token list per text entry
    sg=1,                          # training algorithm: 1 = skip-gram, 0 = CBOW
    vector_size=150,               # dimensionality of each embedding vector
    window=15,                     # max distance between target and context word
    min_count=1,                   # keep every word, even those seen only once
    epochs=50,                     # passes over the corpus
    workers=8,                     # worker threads used for training
)

# Persist the trained model (path is relative to the current working directory).
model.save("wikitext_small_fasttext.model")

2024-10-25 09:24:23,325 : INFO : collecting all words and their counts
2024-10-25 09:24:23,326 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-10-25 09:24:23,395 : INFO : PROGRESS: at sentence #10000, processed 705394 words, keeping 37689 word types
2024-10-25 09:24:23,453 : INFO : PROGRESS: at sentence #20000, processed 1407572 words, keeping 54637 word types
2024-10-25 09:24:23,502 : INFO : collected 66929 word types from a corpus of 2053065 raw words and 29119 sentences
2024-10-25 09:24:23,502 : INFO : Creating a fresh vocabulary
2024-10-25 09:24:23,573 : INFO : FastText lifecycle event {'msg': 'effective_min_count=1 retains 66929 unique words (100.00% of original 66929, drops 0)', 'datetime': '2024-10-25T09:24:23.573267', 'gensim': '4.3.3', 'python': '3.12.3 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 11:44:52) [Clang 14.0.6 ]', 'platform': 'macOS-14.6.1-arm64-i386-64bit', 'event': 'prepare_vocab'}
2024-10-25 09:24:23,573 : INFO : FastText lif

In [5]:
# Sanity check: look up the ten vocabulary words whose vectors are most similar to "day".
similar_words = model.wv.most_similar(positive="day", topn=10)
print(similar_words)

[('next', 0.6294162273406982), ('following', 0.5515981912612915), ('days', 0.546765923500061), ('present', 0.5447957515716553), ('time', 0.5174322128295898), ('hours', 0.5116883516311646), ('celebrating', 0.5100945830345154), ('august', 0.4965319335460663), ('dayli', 0.49179965257644653), ('midden', 0.4871538579463959)]
