In [1]:
!pip install gensim tqdm



In [4]:
import os
import re
import logging
from tqdm import tqdm
from pathlib import Path
from gensim.models import FastText

# Timestamped INFO-level logging for every cell that logs through the root logger.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# --- Configuration ---
DATA_DIR = Path("./data")                      # input: one .txt file per decade
MODEL_OUTPUT_DIR = Path("./embedding_models")  # output: saved FastText models
MODEL_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# FastText Model Parameters (adjust as needed)
VECTOR_SIZE = 100  # Dimensionality of the word vectors
WINDOW_SIZE = 5    # Context window size
MIN_WORD_COUNT = 5 # Minimum word frequency to consider
SKIP_GRAM = 1      # 1 for Skip-gram, 0 for CBOW. Skip-gram is generally better.
EPOCHS = 15        # Number of training iterations over the corpus
MIN_N_CHAR_GRAM = 3 # Minimum length of char n-grams
MAX_N_CHAR_GRAM = 6 # Maximum length of char n-grams

# Use half of the available CPU cores to prevent excessive load.
# os.cpu_count() can return None (fall back to 4), and halving a single-core
# count would give 0 workers, so the result is clamped to at least one.
NUM_WORKERS = max(1, (os.cpu_count() or 4) // 2)

In [5]:
# Sanity check: show the worker count derived in the configuration cell.
print(NUM_WORKERS)

6


In [6]:
# Turkish casing for the capital I pair. str.lower() is locale-independent: it
# maps 'I' to 'i' (Turkish needs dotless 'ı') and 'İ' to 'i' plus a combining
# dot (U+0307), so both capitals are translated explicitly before lowercasing.
_TURKISH_CAPITAL_I_MAP = str.maketrans({"I": "ı", "İ": "i"})

# Characters to delete: anything that is neither a word character nor
# whitespace (punctuation, symbols), plus decimal digits. In Python 3 `\w` is
# Unicode-aware, so Turkish letters (ç, ğ, ı, ö, ş, ü) are kept without being
# listed. Underscores are word characters too and therefore survive.
_STRIP_CHARS_RE = re.compile(r"[^\w\s]|\d")


def preprocess_turkish_text(text_lines):
    """
    Preprocesses an iterable of text lines for Turkish.

    For every line:
    1. Lowercases with Turkish rules for the capital I pair ('I' -> 'ı', 'İ' -> 'i').
    2. Deletes punctuation and digits (deleted characters are not replaced by a
       space, so "a,b" becomes "ab").
    3. Tokenizes by whitespace.

    Args:
        text_lines: Iterable of strings, one sentence per item.

    Returns:
        A list of token lists, one per input line that still has at least one
        token after cleaning. Lines that end up empty are dropped.
    """
    processed_sentences = []
    for line in tqdm(text_lines, desc="Preprocessing text lines"):
        lowered = line.translate(_TURKISH_CAPITAL_I_MAP).lower()
        tokens = _STRIP_CHARS_RE.sub("", lowered).split()
        if tokens:  # skip lines with nothing left after cleaning
            processed_sentences.append(tokens)

    return processed_sentences

In [7]:
def train_decade_model(text_file_path):
    """
    Trains a FastText model for a single decade's text file and saves it.

    Args:
        text_file_path: Path (or path string) of a UTF-8 text file. Every line
            is handed to preprocess_turkish_text as one sentence. The file stem
            (e.g. "1930s" for "1930s.txt") names the decade and the saved model.

    Returns:
        None. On success the model is written to
        MODEL_OUTPUT_DIR / "fasttext_<decade>.model". An unreadable file, an
        empty file, a file without usable tokens, or a failed save is logged
        and the function returns without raising. Errors raised by
        preprocessing or by FastText training itself are not caught here.
    """
    # Accept plain path strings as well as Path objects (.stem needs a Path).
    text_file_path = Path(text_file_path)
    decade_name = text_file_path.stem  # e.g., "1930s" from "1930s.txt"
    logging.info(f"--- Processing decade: {decade_name} ---")

    # Read the text file
    try:
        with open(text_file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except Exception as e:
        logging.error(f"Could not read file {text_file_path}: {e}")
        return

    if not lines:
        logging.warning(f"File {text_file_path} is empty. Skipping.")
        return

    # Preprocess the text
    logging.info(f"Preprocessing text for {decade_name}...")
    sentences = preprocess_turkish_text(lines)
    # The raw lines are not used again; release them so they do not stay in
    # memory next to the tokenized corpus for the whole training run.
    del lines

    if not sentences:
        logging.warning(f"No processable sentences found in {text_file_path} for {decade_name}. Skipping model training.")
        return

    # Train the FastText model (passing `sentences` to the constructor trains immediately)
    logging.info(f"Training FastText model for {decade_name}...")
    model = FastText(
        sentences=sentences,
        vector_size=VECTOR_SIZE,
        window=WINDOW_SIZE,
        min_count=MIN_WORD_COUNT,
        sg=SKIP_GRAM,
        epochs=EPOCHS,
        min_n=MIN_N_CHAR_GRAM,
        max_n=MAX_N_CHAR_GRAM,
        workers=NUM_WORKERS
    )

    # Save the trained model; a failed save is logged, not raised, so a driver
    # loop over several decades can carry on with the next file.
    model_save_path = MODEL_OUTPUT_DIR / f"fasttext_{decade_name}.model"
    try:
        model.save(str(model_save_path))
        logging.info(f"Model for {decade_name} saved to {model_save_path}")
    except Exception as e:
        logging.error(f"Could not save model for {decade_name}: {e}")

    logging.info(f"--- Finished processing for decade: {decade_name} ---")

In [8]:
# Expected layout: DATA_DIR holds one UTF-8 .txt file per decade (e.g. 1930s.txt),
# and every line of a file is treated as one sentence by train_decade_model.
# The directory is deliberately not created here: creating an empty one would
# only hide a misconfigured DATA_DIR and make the check below unreachable.
if not DATA_DIR.is_dir():
    logging.error(f"Data directory not found: {DATA_DIR}")
    logging.error("Please create a 'data' directory and place your decade .txt files (e.g., 1930s.txt) in it.")
    raise FileNotFoundError("Data directory not found")

# Sort once so the logged list matches the processing order
# (chronological, given decade-style names such as 1930s.txt).
decade_files = sorted(DATA_DIR.glob("*.txt"))

if not decade_files:
    logging.warning(f"No .txt files found in {DATA_DIR}. Nothing to process.")
    raise FileNotFoundError("No .txt files found in data directory")

logging.info(f"Found {len(decade_files)} decade files to process: {[f.name for f in decade_files]}")

for text_file in decade_files:
    train_decade_model(text_file)

logging.info("All decades processed.")

2025-05-14 01:39:24,102 : INFO : Found 1 decade files to process: ['1930s.txt']
2025-05-14 01:39:24,103 : INFO : --- Processing decade: 1930s ---
2025-05-14 01:39:28,224 : INFO : Preprocessing text for 1930s...
Preprocessing text lines: 100%|█| 27826829/27826829 [01:24<00:00, 327590.25it/s]
2025-05-14 01:40:53,173 : INFO : Training FastText model for 1930s...
2025-05-14 01:40:53,173 : INFO : collecting all words and their counts
2025-05-14 01:40:53,174 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-05-14 01:40:53,181 : INFO : PROGRESS: at sentence #10000, processed 59372 words, keeping 8392 word types
2025-05-14 01:40:53,188 : INFO : PROGRESS: at sentence #20000, processed 120920 words, keeping 12782 word types
2025-05-14 01:40:53,196 : INFO : PROGRESS: at sentence #30000, processed 175236 words, keeping 17902 word types
2025-05-14 01:40:53,205 : INFO : PROGRESS: at sentence #40000, processed 239785 words, keeping 22981 word types
2025-05-14 01:40:53,21

In [12]:
# Load the saved 1930s model for the sanity checks in the following cells.
test_model_path = MODEL_OUTPUT_DIR / "fasttext_1930s.model"
if not test_model_path.exists():
    logging.error(f"Test model {test_model_path} not found. Run training first.")
    # Raise instead of calling exit(1): inside a notebook `exit` can end the
    # kernel session rather than just stopping this cell. The training driver
    # cell also reports missing inputs with FileNotFoundError.
    raise FileNotFoundError(f"Test model {test_model_path} not found")
logging.info(f"\n--- Loading and testing model: {test_model_path} ---")
loaded_model = FastText.load(str(test_model_path))

2025-05-14 02:11:17,849 : INFO : 
--- Loading and testing model: embedding_models/fasttext_1930s.model ---
2025-05-14 02:11:17,849 : INFO : loading FastText object from embedding_models/fasttext_1930s.model
2025-05-14 02:11:17,909 : INFO : loading wv recursively from embedding_models/fasttext_1930s.model.wv.* with mmap=None
2025-05-14 02:11:17,910 : INFO : loading vectors_vocab from embedding_models/fasttext_1930s.model.wv.vectors_vocab.npy with mmap=None
2025-05-14 02:11:17,933 : INFO : loading vectors_ngrams from embedding_models/fasttext_1930s.model.wv.vectors_ngrams.npy with mmap=None
2025-05-14 02:11:18,080 : INFO : setting ignored attribute vectors to None
2025-05-14 02:11:18,080 : INFO : setting ignored attribute buckets_word to None
2025-05-14 02:11:26,641 : INFO : loading syn1neg from embedding_models/fasttext_1930s.model.syn1neg.npy with mmap=None
2025-05-14 02:11:26,662 : INFO : setting ignored attribute cum_table to None
2025-05-14 02:11:27,972 : INFO : FastText lifecycle e

In [None]:
# Get vector for a word
try:
    word_vector = loaded_model.wv["cumhuriyet"]
    logging.info(f"Vector for 'cumhuriyet': {word_vector[:5]}...") # Print first 5 dims
except KeyError:
    logging.info("'cumhuriyet' not in vocabulary of 1930s model (or below min_count).")

# The out-of-vocabulary lookup lives in its own cell further down. This cell
# must not reference `oov_word` / `oov_vector`: they are only defined there, so
# using them here raised NameError when the notebook was run top to bottom.

In [89]:
# Nearest neighbours of a query word in the embedding space
word = 'mühür'
try:
    similar_words = loaded_model.wv.most_similar(word, topn=100)
except KeyError:
    logging.info(f"{word} is not in vocabulary of 1930s model (or below min_count).")
else:
    # Each entry pairs a neighbouring word with its similarity; show only the words.
    print([neighbour for neighbour, _similarity in similar_words])

['mühürii', 'mühürle', 'mühü', 'mühürleri', 'mühürile', 'mühürler', 'mühürlen', 'mührü', 'mühürü', 'mühürün', 'mühürleriyle', 'mühürlerle', 'mühürlene', 'mührün', 'mühürlerile', 'mühürlenen', 'mumu', 'mühürlerin', 'mühüriyle', 'mühürleme', 'mühürlemek', 'muhür', 'mühürledik', 'mühürüyle', 'mühürlemeğe', 'mumun', 'muhur', 'mühürünü', 'mühürünün', 'mühürlerinin', 'mühürcü', 'mühürlenir', 'mührile', 'mühürlemiş', 'mühürlenmek', 'mübür', 'mühürlerini', 'mühürlenmiş', 'mührünü', 'imzalar', 'muhürü', 'mühürletmeğe', 'mühürlenerek', 'temhir', 'mühürlenecek', 'mührünün', 'mühürleyip', 'mühürlenip', 'imzalarla', 'mühürlendiğinde', 'yapıştırılıp', 'mühürleyerek', 'muhuru', 'îmza', 'mühürlenmekle', 'mübürü', 'imzalarda', 'imzalatmağa', 'imza', 'imzalara', 'imzah', 'mühürledikten', 'mühürlenmemiş', 'iimza', 'mühürlendikten', 'damgada', 'balyaları', 'imzadan', 'mahkük', 'yapıştırılmaz', 'mumum', 'mühübe', 'hürle', 'imzalatılarak', 'imzalarlar', 'imzalatılır', 'yapıştırır', 'mühürlenmesi', 'yapıştır

In [None]:
# FastText can also get vectors for OOV words if their n-grams are known
oov_word = "yepyenişeyler" # A made-up word
oov_vector = loaded_model.wv[oov_word]
# Report the result in the cell that computes it, so the names it uses are
# always defined when the log line runs.
logging.info(f"Vector for OOV word '{oov_word}': {oov_vector[:5]}...") # Print first 5 dims