In [1]:
from datasets import load_dataset
import re

from gensim.models import Word2Vec

import logging

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Corpus: the small raw WikiText-2 configuration.
# (For the much larger corpus, use the "wikitext-103-raw-v1" configuration instead.)
dataset = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1")

# Extract the 'text' column of every split in one pass.
train, valid, test = (
    dataset[split_name]['text'] for split_name in ('train', 'validation', 'test')
)

In [17]:

# Step 2: Tokenize the text using regular expressions
def tokenize_with_re(data):
    """Lower-case each non-blank line of ``data`` and split it into alphabetic tokens.

    Args:
        data: Iterable of strings (one "sentence" per item).

    Returns:
        A list with one token list per non-blank input string, in input order.
        Strings that are empty or whitespace-only are skipped entirely; a
        non-blank string with no matching word still yields an empty list.
        Only runs of ASCII letters delimited by word boundaries are kept, so
        digits, punctuation, and letter runs glued to digits are dropped.
    """
    word_pattern = re.compile(r'\b[a-zA-Z]+\b')

    tokenized_sentences = []
    for sentence in data:
        # Guard clause: ignore lines that carry no visible content.
        if not sentence.strip():
            continue
        tokenized_sentences.append(word_pattern.findall(sentence.lower()))
    return tokenized_sentences
    

# Run the regex tokenizer over the three splits
train_tokenized, valid_tokenized, test_tokenized = map(
    tokenize_with_re, (train, valid, test)
)

# Step 3: Train the Word2Vec model
# The embeddings are fitted on every split, so merge them into one corpus
all_tokenized_data = [*train_tokenized, *valid_tokenized, *test_tokenized]

# Report the token count of the longest sentence in the merged corpus
max_sentence_len = max(map(len, all_tokenized_data))
print(f"Max sentence length: {max_sentence_len}")


Max sentence length: 612


In [18]:
# Surface gensim's INFO-level progress messages, prefixed with a timestamp and level.
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

# Build the vocabulary and train the embeddings in a single constructor call.
model = Word2Vec(
    # --- corpus ---
    sentences=all_tokenized_data,  # list of token lists (all splits merged)
    min_count=1,                   # keep every word, even those seen only once
    # --- architecture ---
    sg=1,                          # 1 = skip-gram (0 would select CBOW)
    vector_size=150,               # dimensionality of each word vector
    window=15,                     # max distance between target and context word
    # --- training schedule ---
    epochs=20,                     # passes over the corpus
    workers=8,                     # worker threads used for training
)

# Persist the trained model to the working directory.
model.save("wikitext_small_word2vec.model")


2024-10-22 16:26:30,473 : INFO : collecting all words and their counts
2024-10-22 16:26:30,474 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-10-22 16:26:30,544 : INFO : PROGRESS: at sentence #10000, processed 705394 words, keeping 37689 word types
2024-10-22 16:26:30,593 : INFO : PROGRESS: at sentence #20000, processed 1407572 words, keeping 54637 word types
2024-10-22 16:26:30,638 : INFO : collected 66929 word types from a corpus of 2053065 raw words and 29119 sentences
2024-10-22 16:26:30,639 : INFO : Creating a fresh vocabulary
2024-10-22 16:26:30,704 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 66929 unique words (100.00% of original 66929, drops 0)', 'datetime': '2024-10-22T16:26:30.704318', 'gensim': '4.3.3', 'python': '3.12.0 | packaged by Anaconda, Inc. | (main, Oct  2 2023, 12:22:05) [Clang 14.0.6 ]', 'platform': 'macOS-15.0-arm64-arm-64bit', 'event': 'prepare_vocab'}
2024-10-22 16:26:30,704 : INFO : Word2Vec lifecy

In [23]:
# Sanity check: list the 10 nearest neighbours of one word
# (ranked by cosine similarity) as (word, score) pairs.
similar_words = model.wv.most_similar(positive=["belgium"], topn=10)
print(similar_words)

[('wallonia', 0.6794160604476929), ('ultratip', 0.6522932052612305), ('slovakia', 0.6315439343452454), ('wallonian', 0.6246206164360046), ('megacharts', 0.6237694025039673), ('flanders', 0.6141849160194397), ('hitparade', 0.6137528419494629), ('vrt', 0.6078399419784546), ('eurochart', 0.6038889288902283), ('ultratop', 0.6013078689575195)]
