In [1]:
from datasets import load_dataset
import re

from gensim.models import FastText

import logging

In [2]:
# Load the WikiText-2 (raw) configuration via `datasets.load_dataset`.
# The larger "wikitext-103-raw-v1" configuration can be substituted here.
dataset = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1")

# Pull the 'text' column out of each split, in train / validation / test order.
train, valid, test = (
    dataset[split_name]['text'] for split_name in ('train', 'validation', 'test')
)

In [3]:

# Step 2: Regex-based tokenization
def tokenize_with_re(data):
    """Lower-case each non-blank entry of ``data`` and split it into alphabetic tokens.

    Entries that are empty or contain only whitespace are skipped entirely.
    A token is a run of ASCII letters with a word boundary on both sides, so
    letter runs that touch a digit or underscore (e.g. ``word2vec``) produce
    no token at all. An entry that contains no such run still contributes an
    empty list to the result.

    Args:
        data: Iterable of strings (one entry per line/sentence).

    Returns:
        A list with one list of lower-cased tokens per non-blank entry,
        in the same order as the input.
    """
    tokenized_sentences = []
    for sentence in data:
        # Guard clause: blank / whitespace-only entries are dropped.
        if not sentence.strip():
            continue
        tokenized_sentences.append(re.findall(r'\b[a-zA-Z]+\b', sentence.lower()))
    return tokenized_sentences
    

# Run the regex tokenizer over every dataset split.
train_tokenized = tokenize_with_re(train)
valid_tokenized = tokenize_with_re(valid)
test_tokenized = tokenize_with_re(test)

# NOTE: only the first half of the tokenized training split is kept from here on
# (presumably to shorten training time -- TODO confirm this is still intended).
train_tokenized = train_tokenized[:len(train_tokenized) // 2]

# Step 3: Assemble the corpus used to fit the embedding model.
# The (halved) train split is concatenated with the full validation and test splits.
all_tokenized_data = [*train_tokenized, *valid_tokenized, *test_tokenized]

# Report the token count of the longest sentence in the combined corpus.
max_sentence_len = max(map(len, all_tokenized_data))
print(f"Max sentence length: {max_sentence_len}")


Max sentence length: 575


In [4]:
# Surface INFO-level log records (gensim reports training progress through
# the logging module) with a timestamp and level prefix.
logging.basicConfig(level=logging.INFO, format="%(asctime)s : %(levelname)s : %(message)s")

# Fit FastText embeddings on the combined tokenized corpus. Because `sentences`
# is passed to the constructor, vocabulary building and training happen here.
model = FastText(
    sentences=all_tokenized_data,  # list of token lists
    sg=1,                          # 1 = skip-gram, 0 = CBOW
    vector_size=150,               # dimensionality of each word vector
    window=15,                     # context window size
    min_count=1,                   # keep every word, even those seen only once
    epochs=50,                     # passes over the corpus
    workers=8,                     # worker threads used for training
)

# Persist the fitted model to disk.
model.save("wikitext_small_fasttext.model")

2024-11-17 15:20:54,929 : INFO : collecting all words and their counts
2024-11-17 15:20:54,930 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-11-17 15:20:54,996 : INFO : PROGRESS: at sentence #10000, processed 705394 words, keeping 37689 word types
2024-11-17 15:20:55,035 : INFO : collected 49596 word types from a corpus of 1215298 raw words and 17235 sentences
2024-11-17 15:20:55,035 : INFO : Creating a fresh vocabulary
2024-11-17 15:20:55,087 : INFO : FastText lifecycle event {'msg': 'effective_min_count=1 retains 49596 unique words (100.00% of original 49596, drops 0)', 'datetime': '2024-11-17T15:20:55.087048', 'gensim': '4.3.3', 'python': '3.12.3 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 11:44:52) [Clang 14.0.6 ]', 'platform': 'macOS-14.6.1-arm64-i386-64bit', 'event': 'prepare_vocab'}
2024-11-17 15:20:55,087 : INFO : FastText lifecycle event {'msg': 'effective_min_count=1 leaves 1215298 word corpus (100.00% of original 1215298, drops 0)', '

In [7]:
# Sanity-check the embeddings: list the 10 nearest neighbours of a probe word.
word_vectors = model.wv
similar_words = word_vectors.most_similar("belgium", topn=10)
print(similar_words)

[('belgic', 0.6026257872581482), ('wallonia', 0.5879192352294922), ('belgian', 0.5818304419517517), ('belgians', 0.5784717202186584), ('slovakia', 0.5540475845336914), ('flanders', 0.5434573888778687), ('germany', 0.5172135829925537), ('francophonie', 0.50387042760849), ('vieil', 0.5008867979049683), ('france', 0.5001628398895264)]
