In [None]:
!pip install gensim tqdm

In [None]:
import os
import re
import logging
from tqdm import tqdm
from pathlib import Path
from gensim.models import FastText
from collections import Counter
from pathlib import Path

# Notebook-wide configuration: logging, input/output locations, FastText
# hyper-parameters and the number of training threads.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Input corpus directory and output directory for trained models
# (the output directory is created on first run).
DATA_DIR = Path("./preprocessed")
MODEL_OUTPUT_DIR = Path("./embedding_models")
MODEL_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# FastText Model Parameters
VECTOR_SIZE = 100  # Dimensionality of the word vectors
WINDOW_SIZE = 5    # Context window size
MIN_WORD_COUNT = 5 # Minimum word frequency to consider
SKIP_GRAM = 1      # 1 for Skip-gram, 0 for CBOW. Skip-gram is generally better.
EPOCHS = 15        # Number of training iterations over the corpus
MIN_N_CHAR_GRAM = 3 # Minimum length of char n-grams
MAX_N_CHAR_GRAM = 6 # Maximum length of char n-grams
NEGATIVE = 5 # Number of negative samples (skip-gram with negative sampling)

RESERVED_CORES = 3  # Cores left free to prevent excessive load


def resolve_num_workers(cpu_total, reserved=RESERVED_CORES):
    """Return the number of training threads to use.

    Args:
        cpu_total: Core count as reported by ``os.cpu_count()``; may be
            ``None`` (or 0) when it cannot be determined, in which case 4
            cores are assumed.
        reserved: Number of cores to leave unused.

    Returns:
        ``cpu_total - reserved``, but never less than 1, so that machines
        with few cores still get a valid (positive) worker count.
    """
    available = cpu_total if cpu_total else 4
    return max(1, available - reserved)


NUM_WORKERS = resolve_num_workers(os.cpu_count())
logging.info(f"Number of workers is {NUM_WORKERS}")

In [None]:
def train_decade_model(text_file_path):
    """Train a FastText model for one decade and save it to MODEL_OUTPUT_DIR.

    Only the *name* of ``text_file_path`` is used: its stem must end with the
    five-character decade label (e.g. ``..._1940s.txt`` -> ``1940s``). The
    corpus that is actually read is ``DATA_DIR / "preprocessed_<decade>.txt"``,
    expected to contain one whitespace-tokenised sentence per line.

    Failures are logged rather than raised: if the corpus cannot be read, is
    empty, or yields no trainable vocabulary, the function logs an error and
    returns without saving a model.

    Args:
        text_file_path: Path (or path string) whose file name identifies the
            decade to process.

    Returns:
        None. The trained model is written to
        ``MODEL_OUTPUT_DIR / "fasttext_<decade>.model"``.
    """
    # Use the stem so the slice does not depend on the suffix being 4 chars long.
    decade_name = Path(text_file_path).stem[-5:]  # 1930s, 1940s, etc
    logging.info(f"--- Processing decade: {decade_name} ---")

    preprocessed_file_path = DATA_DIR / f"preprocessed_{decade_name}.txt"
    logging.info(f"Loading preprocessed sentences from {preprocessed_file_path}...")
    sentences = []
    try:
        with open(preprocessed_file_path, 'r', encoding='utf-8') as f:
            # Iterate lazily instead of readlines() so the raw text is not
            # held in memory alongside the token lists.
            for line in f:
                tokens = line.split()
                if tokens:  # blank lines are not sentences
                    sentences.append(tokens)
        logging.info(f"Loaded {len(sentences)} sentences from preprocessed file.")
    except (OSError, UnicodeDecodeError) as e:
        logging.error(f"Could not read preprocessed file {preprocessed_file_path}: {e}")
        return

    if not sentences:
        # FastText cannot build a vocabulary from an empty corpus.
        logging.error(f"Preprocessed file {preprocessed_file_path} contains no sentences; skipping {decade_name}.")
        return

    # Train the FastText model
    logging.info(f"Training FastText model for {decade_name}...")
    try:
        model = FastText(
            sentences=sentences,
            vector_size=VECTOR_SIZE,
            window=WINDOW_SIZE,
            min_count=MIN_WORD_COUNT,
            sg=SKIP_GRAM,
            negative=NEGATIVE,
            epochs=EPOCHS,
            min_n=MIN_N_CHAR_GRAM,
            max_n=MAX_N_CHAR_GRAM,
            workers=NUM_WORKERS
        )
    except RuntimeError as e:
        # Raised by gensim when no word survives min_count (empty vocabulary).
        logging.error(f"Could not train model for {decade_name}: {e}")
        return

    # Save the trained model
    model_save_path = MODEL_OUTPUT_DIR / f"fasttext_{decade_name}.model"
    try:
        model.save(str(model_save_path))
        logging.info(f"Model for {decade_name} saved to {model_save_path}")
    except Exception as e:
        logging.error(f"Could not save model for {decade_name}: {e}")

    logging.info(f"--- Finished processing for decade: {decade_name} ---")

In [None]:
# Train one model per matching corpus file in DATA_DIR.
# Adjust the glob pattern to choose which decade(s) to (re)train.
decade_files = list(DATA_DIR.glob("*1940s.txt")) # adjust
if not decade_files:
    # Without this, an empty match would silently report success below.
    logging.warning(f"No decade files matched in {DATA_DIR}; nothing was trained.")

for text_file in sorted(decade_files):
    train_decade_model(text_file)

logging.info("All decades processed.")

In [None]:
# Load a previously trained model so the following cells can query it.
test_model_path = MODEL_OUTPUT_DIR / "fasttext_1930s.model"
if not test_model_path.exists():
    logging.error(f"Test model {test_model_path} not found. Run training first.")
    # Stop the cell here with an explicit message instead of falling through
    # to FastText.load on a path known to be missing.
    raise FileNotFoundError(f"Test model {test_model_path} not found. Run training first.")

logging.info(f"\n--- Loading and testing model: {test_model_path} ---")
loaded_model = FastText.load(str(test_model_path))

In [None]:
# Show the ten nearest neighbours of a query word in the loaded model.
word = "kanun"
try:
    similar_words = loaded_model.wv.most_similar(word, topn=10)
except KeyError:
    logging.info(f"{word} is not in vocabulary of 1930s model (or below min_count).")
else:
    # most_similar yields (word, similarity) pairs; display the words only.
    print([neighbour for neighbour, _ in similar_words])

In [None]:
# Get vector for a word
try:
    word_vector = loaded_model.wv["cumhuriyet"]
except KeyError:
    logging.info("'cumhuriyet' not in vocabulary of 1930s model (or below min_count).")
else:
    # Only log the vector when the lookup succeeded, so a failed lookup
    # cannot lead to a NameError on an unassigned word_vector.
    logging.info(f"Vector for 'cumhuriyet': {word_vector[:5]}...") # Print first 5 dims

In [None]:
# FastText can also get vectors for OOV words if their n-grams are known
oov_word = "yepyenişeyler" # A made-up word
oov_vector = loaded_model.wv[oov_word]
# Show the result so the cell demonstrates the lookup rather than discarding it.
logging.info(f"Vector for OOV word '{oov_word}': {oov_vector[:5]}...") # Print first 5 dims