In [1]:
from datasets import load_dataset
import re

from gensim.models import FastText

import logging

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "wikitext-2-raw-v1" configuration of Salesforce/wikitext.
# Alternative configuration name: "wikitext-103-raw-v1".
dataset = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1")

# Extract the 'text' column of each split, in train / validation / test order.
train, valid, test = (
    dataset[split_name]['text'] for split_name in ('train', 'validation', 'test')
)

In [3]:

# Step 2: Tokenize the text using regular expressions
def tokenize_with_re(data):
    """Split each non-blank string in ``data`` into lower-cased alphabetic tokens.

    Args:
        data: Iterable of strings (one entry per sentence/line).

    Returns:
        A list of token lists, one per input string that is not empty or
        whitespace-only. Each string is lower-cased first; a token is a run
        of ASCII letters delimited by word boundaries on both sides, so runs
        that touch digits or underscores are not returned. An input string
        without any such run contributes an empty list.
    """
    word_pattern = re.compile(r'\b[a-zA-Z]+\b')

    tokenized_sentences = []
    for sentence in data:
        # Blank / whitespace-only entries are skipped entirely.
        if not sentence.strip():
            continue
        tokenized_sentences.append(word_pattern.findall(sentence.lower()))
    return tokenized_sentences
    

# Run the regex tokenizer over each of the three splits
train_tokenized = tokenize_with_re(train)
valid_tokenized = tokenize_with_re(valid)
test_tokenized = tokenize_with_re(test)

# Step 3: Build the training corpus for the FastText model
# Train, validation, and test sentences are concatenated into one list
all_tokenized_data = [*train_tokenized, *valid_tokenized, *test_tokenized]

# Report the token count of the longest tokenized sentence
max_sentence_len = max(map(len, all_tokenized_data))
print(f"Max sentence length: {max_sentence_len}")


Max sentence length: 612


In [6]:
# Show INFO-level log records (timestamp : level : message) while training runs
logging.basicConfig(level=logging.INFO, format="%(asctime)s : %(levelname)s : %(message)s")

# Hyperparameters for the FastText model
fasttext_params = dict(
    vector_size=150,  # Dimensionality of the word vectors (embeddings)
    window=15,        # Context window size
    min_count=1,      # Minimum frequency for a word to be kept in the vocabulary
    sg=1,             # 1 selects Skip-gram; 0 would select CBOW
    workers=8,        # Number of worker threads used for training
    epochs=50,        # Number of passes over the corpus
)

# Build and train the model on the tokenized sentences
model = FastText(sentences=all_tokenized_data, **fasttext_params)

# Persist the trained FastText model to disk
model.save("wikitext_small_fasttext.model")

2024-10-23 11:27:20,155 : INFO : collecting all words and their counts
2024-10-23 11:27:20,157 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-10-23 11:27:20,234 : INFO : PROGRESS: at sentence #10000, processed 705394 words, keeping 37689 word types
2024-10-23 11:27:20,290 : INFO : PROGRESS: at sentence #20000, processed 1407572 words, keeping 54637 word types
2024-10-23 11:27:20,341 : INFO : collected 66929 word types from a corpus of 2053065 raw words and 29119 sentences
2024-10-23 11:27:20,342 : INFO : Creating a fresh vocabulary
2024-10-23 11:27:20,410 : INFO : FastText lifecycle event {'msg': 'effective_min_count=1 retains 66929 unique words (100.00% of original 66929, drops 0)', 'datetime': '2024-10-23T11:27:20.410055', 'gensim': '4.3.3', 'python': '3.12.0 | packaged by Anaconda, Inc. | (main, Oct  2 2023, 12:22:05) [Clang 14.0.6 ]', 'platform': 'macOS-15.0-arm64-arm-64bit', 'event': 'prepare_vocab'}
2024-10-23 11:27:20,410 : INFO : FastText lifecy

In [7]:
# Sanity-check the trained vectors: list the nearest neighbours of a query word
query_word = "belgium"
neighbour_count = 10
similar_words = model.wv.most_similar(query_word, topn=neighbour_count)
print(similar_words)

[('belgic', 0.6304888725280762), ('wallonia', 0.6188931465148926), ('flanders', 0.5946247577667236), ('wallonian', 0.5911387205123901), ('belgian', 0.5707005858421326), ('france', 0.5513981580734253), ('sweden', 0.5501424074172974), ('germany', 0.5411299467086792), ('walloonian', 0.5287423133850098), ('ultratip', 0.5263240933418274)]
