In [1]:
%pip install gensim tqdm optuna

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import re
import logging
from tqdm import tqdm
from pathlib import Path
from gensim.models import FastText
from collections import Counter
from pathlib import Path

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Input directory with the preprocessed corpora and output directory for trained models.
DATA_DIR = Path("./preprocessed")
MODEL_OUTPUT_DIR = Path("./embedding_models")
MODEL_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# FastText Model Parameters
VECTOR_SIZE = 100   # Dimensionality of the word vectors
WINDOW_SIZE = 3     # Context window size
MIN_WORD_COUNT = 2  # Minimum word frequency to consider
SKIP_GRAM = 0       # 1 for Skip-gram, 0 for CBOW. CBOW is the architecture selected here.
EPOCHS = 10         # Number of training iterations over the corpus
MIN_N_CHAR_GRAM = 3 # Minimum length of char n-grams
MAX_N_CHAR_GRAM = 4 # Maximum length of char n-grams
NEGATIVE = 10       # Number of negative samples (negative sampling applies to CBOW and Skip-gram alike)

# Leave three cores free to prevent excessive load, but never go below one worker:
# on machines with three or fewer cores the plain subtraction would yield a
# zero or negative worker count.
NUM_WORKERS = max((os.cpu_count() or 4) - 3, 1)
logging.info(f"Number of workers is {NUM_WORKERS}")

2025-05-15 21:37:05,714 : INFO : Number of workers is 13


In [3]:
def train_decade_model(text_file_path: Path):
    """Train a FastText model on one decade's preprocessed corpus and save it.

    The decade label is sliced out of the file name (the five characters
    before the 4-character extension, e.g. ``1930s``). The corpus is then read
    from ``DATA_DIR / "preprocessed_<decade>.txt"``, one whitespace-tokenised
    sentence per line, and the model is written to
    ``MODEL_OUTPUT_DIR / "fasttext_<decade>.model"``.

    Args:
        text_file_path: Path whose file name ends in ``<decade>.txt``.

    Returns:
        The path of the saved model on success, or ``None`` when the corpus
        could not be read, was empty, or the model could not be saved.
        Failures are logged rather than raised.
    """
    decade_name = text_file_path.name[-9:-4]  # 1930s, 1940s, etc
    logging.info(f"--- Processing decade: {decade_name} ---")

    preprocessed_file_path = DATA_DIR / f"preprocessed_{decade_name}.txt"
    logging.info(f"Loading preprocessed sentences from {preprocessed_file_path}...")
    try:
        with open(preprocessed_file_path, 'r', encoding='utf-8') as f:
            # Iterate the file lazily: readlines() would keep a second full
            # copy of the corpus in memory next to the token lists.
            # str.split() without arguments already ignores surrounding whitespace.
            sentences = [line.split() for line in f]
        logging.info(f"Loaded {len(sentences)} sentences from preprocessed file.")
    except Exception as e:
        logging.error(f"Could not read preprocessed file {preprocessed_file_path}: {e}")
        return None

    # Training on a corpus without a single token cannot build a vocabulary,
    # so report it here instead of failing inside gensim.
    if not any(sentences):
        logging.error(f"Preprocessed file {preprocessed_file_path} contains no tokens; skipping {decade_name}.")
        return None

    # Train the FastText model
    logging.info(f"Training FastText model for {decade_name}...")
    model = FastText(
        sentences=sentences,
        vector_size=VECTOR_SIZE,
        window=WINDOW_SIZE,
        min_count=MIN_WORD_COUNT,
        sg=SKIP_GRAM,
        negative=NEGATIVE,
        epochs=EPOCHS,
        min_n=MIN_N_CHAR_GRAM,
        max_n=MAX_N_CHAR_GRAM,
        workers=NUM_WORKERS
    )

    # Save the trained model
    model_save_path = MODEL_OUTPUT_DIR / f"fasttext_{decade_name}.model"
    saved_path = None
    try:
        model.save(str(model_save_path))
        logging.info(f"Model for {decade_name} saved to {model_save_path}")
        saved_path = model_save_path
    except Exception as e:
        logging.error(f"Could not save model for {decade_name}: {e}")

    logging.info(f"--- Finished processing for decade: {decade_name} ---")
    return saved_path

In [4]:
# Collect every .txt file in the data directory (glob order is not sorted).
decade_files = [*DATA_DIR.glob("*.txt")]

In [5]:
# Find the decade whose preprocessed corpus holds the most tokens.
largest_corpus, max_word_count = None, -1
for file in decade_files:
    decade_name = file.name[-9:-4]
    preprocessed_file_path = DATA_DIR / f"preprocessed_{decade_name}.txt"
    with open(preprocessed_file_path, 'r', encoding='utf-8') as f:
        # Count whitespace-separated tokens line by line. Joining the lines
        # with an empty string first would glue the last token of each line to
        # the first token of the next one and undercount; streaming also
        # avoids loading the whole file into memory.
        word_count = sum(len(line.split()) for line in f)
    if word_count > max_word_count:
        largest_corpus = decade_name
        max_word_count = word_count


In [6]:
# Display the decade with the largest token count; its corpus is the one
# the tuning objective below trains on.
largest_corpus

'1990s'

In [7]:
import multiprocessing as mp, json, optuna
from gensim.models import FastText
from evaluation import eval_loanword_coverage, eval_alignment

  from .autonotebook import tqdm as notebook_tqdm


In [8]:

# Tokenised corpora keyed by file path, so the corpus is read from disk once
# and reused by every trial instead of being re-parsed per trial.
_TUNING_SENTENCES_CACHE = {}


def _load_tuning_sentences(preprocessed_file_path):
    """Return the tokenised sentences of ``preprocessed_file_path``, reading it at most once."""
    cache_key = str(preprocessed_file_path)
    if cache_key not in _TUNING_SENTENCES_CACHE:
        with open(preprocessed_file_path, 'r', encoding='utf-8') as f:
            # One sentence per line; split() without arguments ignores surrounding whitespace.
            _TUNING_SENTENCES_CACHE[cache_key] = [line.split() for line in f]
        logging.info(f"Loaded {len(_TUNING_SENTENCES_CACHE[cache_key])} sentences from preprocessed file.")
    return _TUNING_SENTENCES_CACHE[cache_key]


def objective(trial):
    """Optuna objective: train FastText on the largest decade corpus and score it.

    Hyper-parameters are sampled from fixed categorical grids. The score is
    ``(loan_cov * align_acc) / (1 + 0.1 * hours_of_training)``, i.e. the
    product of the two evaluation metrics discounted by training time.
    The gensim keyword arguments actually used are stored on the trial under
    the ``"params"`` user attribute, because the Optuna parameter names
    (``dim``, ``win``, ...) differ from the gensim ones.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        The score to maximise.

    Raises:
        Exception: Whatever reading the corpus raised (for example ``OSError``).
            The error is logged and re-raised; returning ``None`` instead would
            only make Optuna record a failed trial for the same reason on
            every subsequent attempt.
    """
    params = {
        "vector_size": trial.suggest_categorical("dim", [100, 200, 300]),
        "window":      trial.suggest_categorical("win", [3, 5, 8]),
        "min_count":   trial.suggest_categorical("mc",  [2, 3, 5]),
        "sg":          trial.suggest_categorical("sg",  [0, 1]),
        "negative":    trial.suggest_categorical("neg", [5, 10, 15]),
        "epochs":      trial.suggest_categorical("ep",  [10, 20, 30]),
        "min_n":       trial.suggest_categorical("minn",[2, 3]),
        "max_n":       trial.suggest_categorical("maxn",[4, 6]),
        "sample":      trial.suggest_categorical("samp",[1e-3, 1e-4]),
        "workers":     max(mp.cpu_count()-3, 2),
        "seed":        42,
    }

    preprocessed_file_path = DATA_DIR / f"preprocessed_{largest_corpus}.txt"
    try:
        sentences = _load_tuning_sentences(preprocessed_file_path)
    except Exception as e:
        logging.error(f"Could not read preprocessed file {preprocessed_file_path}: {e}")
        raise

    model = FastText(sentences, **params)       # sentences = one-decade list of tokens
    # NOTE(review): both metrics come from the local `evaluation` module and are
    # presumably in the 0-1 range -- confirm against that module.
    loan_cov = eval_loanword_coverage(model)
    align_acc = eval_alignment(model)
    time_pen  = model.total_train_time / 3600   # hours

    score = (loan_cov * align_acc) / (1 + 0.1 * time_pen)
    trial.set_user_attr("params", params)
    return score

# Open the SQLite-backed study, resuming it when a study of that name already exists.
study = optuna.create_study(
    study_name="fasttext_tuning",
    storage="sqlite:///ft_tuning.db",
    load_if_exists=True,
    direction="maximize",
)

# Run up to 30 further trials within an 8-hour time budget.
study.optimize(objective, timeout=8*3600, n_trials=30)

# Persist the gensim keyword arguments recorded by the best trial.
best = study.best_trial.user_attrs["params"]
with open("best_ft_params.json", "w") as f:
    json.dump(best, f, indent=2)

[I 2025-05-15 21:37:41,973] Using an existing study with name 'fasttext_tuning' instead of creating a new one.
2025-05-15 21:38:08,598 : INFO : Loaded 18150554 sentences from preprocessed file.
2025-05-15 21:38:08,600 : INFO : collecting all words and their counts
2025-05-15 21:38:08,600 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-05-15 21:38:08,606 : INFO : PROGRESS: at sentence #10000, processed 35274 words, keeping 3562 word types
2025-05-15 21:38:08,612 : INFO : PROGRESS: at sentence #20000, processed 72500 words, keeping 5013 word types
2025-05-15 21:38:08,619 : INFO : PROGRESS: at sentence #30000, processed 112572 words, keeping 6146 word types
2025-05-15 21:38:08,625 : INFO : PROGRESS: at sentence #40000, processed 156750 words, keeping 7136 word types
2025-05-15 21:38:08,632 : INFO : PROGRESS: at sentence #50000, processed 191530 words, keeping 7858 word types
2025-05-15 21:38:08,638 : INFO : PROGRESS: at sentence #60000, processed 225572 wor

In [None]:
# Re-scan the data directory and train one model per file, in sorted path order.
decade_files = [*DATA_DIR.glob("*.txt")]
ordered_decade_files = sorted(decade_files)
for text_file in ordered_decade_files:
    train_decade_model(text_file)

logging.info("All decades processed.")

In [None]:
# Load one trained model for the sanity checks below.
test_model_path = MODEL_OUTPUT_DIR / "fasttext_1940s.model"
if not test_model_path.exists():
    # Stop here: falling through would only reach FastText.load() and fail
    # there with a less helpful error.
    logging.error(f"Test model {test_model_path} not found. Run training first.")
    raise FileNotFoundError(f"Test model {test_model_path} not found. Run training first.")

logging.info(f"\n--- Loading and testing model: {test_model_path} ---")
loaded_model = FastText.load(str(test_model_path))

In [None]:
# Find most similar word
word = 'çene'
try:
    similar_words = loaded_model.wv.most_similar(word, topn=10)
    print([t[0] for t in similar_words])
except KeyError:
    logging.info(f"{word} is not in vocabulary of 1930s model (or below min_count).")

In [None]:
# Get vector for a word
try:
    word_vector = loaded_model.wv["cumhuriyet"]
    logging.info(f"Vector for 'cumhuriyet': {word_vector[:5]}...") # Print first 5 dims
except KeyError:
    logging.info("'cumhuriyet' not in vocabulary of 1930s model (or below min_count).")

logging.info(f"Vector for OOV word '{word_vector}': {word_vector[:5]}...")

In [None]:
# FastText can also get vectors for OOV words if their n-grams are known
oov_word = "yepyenişeyler" # A made-up word
oov_vector = loaded_model.wv[oov_word]
logging.info(f"Vector for OOV word '{oov_word}': {oov_vector[:5]}...") # Print first 5 dims