In [None]:
!pip install gensim tqdm zemberek-python jpype1

In [1]:
import os
import re
import logging
from tqdm import tqdm
from pathlib import Path
from gensim.models import FastText
from zemberek import (
    TurkishSpellChecker,
    TurkishSentenceNormalizer,
    TurkishSentenceExtractor,
    TurkishMorphology,
    TurkishTokenizer
)
from collections import Counter
from pathlib import Path

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# --- Configuration ---jpype.startJVM(jpype.getDefaultJVMPath(), "-Djava.class.path=/path/to/zemberek.jar")

DATA_DIR = Path("./data")
MODEL_OUTPUT_DIR = Path("./embedding_models")
MODEL_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# FastText Model Parameters
VECTOR_SIZE = 100  # Dimensionality of the word vectors
WINDOW_SIZE = 5    # Context window size
MIN_WORD_COUNT = 5 # Minimum word frequency to consider
SKIP_GRAM = 1      # 1 for Skip-gram, 0 for CBOW. Skip-gram is generally better.
EPOCHS = 15        # Number of training iterations over the corpus
MIN_N_CHAR_GRAM = 3 # Minimum length of char n-grams
MAX_N_CHAR_GRAM = 6 # Maximum length of char n-grams
NEGATIVE = 5 # SGNS

NUM_WORKERS = os.cpu_count() if os.cpu_count() else 4 # Use available CPU cores
NUM_WORKERS = NUM_WORKERS - 3 # prevent excessive load

In [2]:
logging.info(f"Number of workers is {NUM_WORKERS}")

2025-05-14 21:25:47,542 - root - INFO
Msg: Number of workers is 9



In [3]:
# stopword list from https://github.com/ahmetax/trstop/blob/master/dosyalar/turkce-stop-words
STOPWORDS = [
    'acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'bile', 'bir', 'biraz', 'birçoğu',
    'birçok', 'birisi', 'birkaç', 'birşey', 'biz', 'bizden', 'bize', 'bizi', 'bizim', 'bu',
    'buna', 'bundan', 'bunlar', 'bunları', 'bunların', 'bunu', 'bunun', 'burada', 'böyle',
    'böylece', 'da', 'daha', 'de', 'defa', 'diye', 'eğer', 'en', 'gibi', 'hem', 'hep',
    'hepsi', 'her', 'herkes', 'hiç', 'hiçbir', 'için', 'ile', 'ise', 'içinde', 'kadar', 'ki',
    'kim', 'kimse', 'mı', 'mi', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'nerde', 'nereye', 'niye',
    'niçin', 'o', 'olan', 'olarak', 'oldu', 'olduğu', 'olmak', 'olmaz', 'olsun', 'on', 'ona',
    'ondan', 'onlar', 'onlardan', 'onları', 'onların', 'onu', 'onun', 'orada', 'sanki',
    'sadece', 'sen', 'senden', 'sende', 'seni', 'senin', 'siz', 'sizden', 'size', 'sizi',
    'sizin', 'şey', 'şu', 'şuna', 'şunda', 'şundan', 'şunları', 'şunlar', 'şunu', 'şunun',
    'ta', 'tamam', 'tüm', 've', 'veya', 'ya', 'yani'
]

In [4]:
try:
    morphology = TurkishMorphology.create_with_defaults()
    normalizer = TurkishSentenceNormalizer(morphology)
    spell_checker = TurkishSpellChecker(morphology)
    extractor = TurkishSentenceExtractor()
    tokenizer = TurkishTokenizer.DEFAULT
    logging.info("Zemberek tools initialized successfully")
except Exception as e:
    logging.error(f"Failed to initialize Zemberek: {e}")
    morphology = normalizer = spell_checker = extractor = tokenizer = None

2025-05-14 21:25:50,808 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 3.251668691635132

2025-05-14 21:26:00,134 - root - INFO
Msg: Zemberek tools initialized successfully



In [14]:
def preprocess_turkish_text(text_lines):
    processed_sentences = []
    vocab_counts = Counter()

    for line_num, line in enumerate(tqdm(text_lines, desc="Preprocessing")):
        if not line.strip():
            continue
        
        sentences = extractor.from_paragraph(line.strip())
        unwanted_token_types = ['Punctuation', 'Emoticon', 'UnknownWord', 'Number', 'SpaceTab', 'NewLine', 'RomanNumeral', 'PercentNumeral', 'Time', 'Date', 'URL', 'Email', 'HashTag', 'Mention', 'MetaTag', 'Emoji', 'Emoticon', 'UnknownWord', 'Unknown']
        for sentence in sentences:
            # Tokenization
            tokens = tokenizer.tokenize(sentence)
            token_list = [token.content for token in tokens if token.type_.name not in unwanted_token_types and token.content.lower() not in STOPWORDS]
            if not token_list:
                continue
            
            normalized_text = normalizer.normalize(' '.join(token_list))
            normalized_tokens = tokenizer.tokenize(normalized_text)
            normalized_tokens = [token.content.lower() for token in normalized_tokens if token.type_.name not in unwanted_token_types and token.content.lower() not in STOPWORDS]
            
            # Spell correction
            corrected_tokens = []
            for token in normalized_tokens:
                results = morphology.analyze(token)
                if not results or len(str(results)) < 5:
                    suggestions = spell_checker.suggest_for_word(token)
                    token = suggestions[0] if suggestions else token
                corrected_tokens.append(token)

            # Lemmatization
            lemmatized_tokens = []
            for token in corrected_tokens:
                results = morphology.analyze(token)
                largest_lemma = ''
                largest_lemma_length = 0
                for result in results:
                    analysis_str = str(result)
                    if analysis_str.startswith('['):
                        lemma_section = analysis_str.split(']')[0]
                        lemma = lemma_section.split(':')[0].lstrip('[')
                    else:
                        lemma = analysis_str.split(':')[0]
                    if (len(lemma) > largest_lemma_length):
                        largest_lemma = lemma
                if largest_lemma:
                    lemmatized_tokens.append(largest_lemma.lower())

            final_tokens = [token for token in lemmatized_tokens if token not in STOPWORDS]

            # Update vocabulary counts
            if final_tokens:
                vocab_counts.update(final_tokens)
                processed_sentences.append(final_tokens)

    # Vocabulary pruning
    shared_vocab = {word for word, count in vocab_counts.items() if count >= MIN_WORD_COUNT}
    final_sentences = []
    for sentence in processed_sentences:
        final_sentence = [word if word in shared_vocab else '<UNK>' for word in sentence]
        if final_sentence and any(word != '<UNK>' for word in final_sentence):
            final_sentences.append(final_sentence)

    logging.info(f"Processed {len(final_sentences)} sentences with {len(shared_vocab)} unique words")
    decade_name = 'test'
    preprocessed_file_path = MODEL_OUTPUT_DIR / f"preprocessed_{decade_name}.txt"
    try:
        with open(preprocessed_file_path, 'w', encoding='utf-8') as f:
            for sentence in processed_sentences:
                f.write(' '.join(sentence) + '\n')
        logging.info(f"Preprocessed sentences saved to {preprocessed_file_path}")
    except Exception as e:
        logging.error(f"Could not save preprocessed sentences for {decade_name}: {e}")
        
    return processed_sentences

Preprocessing: 100%|█████████████████████████████| 2/2 [00:00<00:00, 117.13it/s]

2025-05-14 21:35:20,932 - root - INFO
Msg: Processed 0 sentences with 0 unique words

2025-05-14 21:35:20,933 - root - INFO
Msg: Preprocessed sentences saved to embedding_models/preprocessed_test.txt

[['bugün', 'dün', 'iyi', 'yapmak', 'büyümek', 'değer'], ['mut', 'sırrı', 'mut', 'geçer']]





In [15]:
def preprocess_all_decade_files(decade_files):
    for file_path in decade_files:
        decade_name = file_path.stem
        logging.info(f"--- Preprocessing for {decade_name} ---")
        
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()
        except Exception as e:
            logging.error(f"Could not read {file_path}: {e}")
            continue

        if not lines:
            logging.warning(f"{file_path} is empty. Skipping.")
            continue

        processed_sentences = preprocess_turkish_text(lines)

        if not processed_sentences:
            logging.warning(f"No valid sentences found in {file_path}. Skipping save.")
            continue

        save_path = MODEL_OUTPUT_DIR / f"preprocessed_{decade_name}.txt"
        try:
            with open(save_path, 'w', encoding='utf-8') as out_f:
                for sentence in processed_sentences:
                    out_f.write(' '.join(sentence) + '\n')
            logging.info(f"Saved preprocessed file: {save_path}")
        except Exception as e:
            logging.error(f"Could not save preprocessed file {save_path}: {e}")

In [6]:
def train_decade_model(text_file_path):
    decade_name = text_file_path.stem  # 1930s, 1940s, etc
    logging.info(f"--- Processing decade: {decade_name} ---")

    preprocessed_file_path = MODEL_OUTPUT_DIR / f"preprocessed_{decade_name}.txt"
    if os.path.exists(preprocessed_file_path):
        logging.info(f"Loading preprocessed sentences from {preprocessed_file_path}...")
        try:
            with open(preprocessed_file_path, 'r', encoding='utf-8') as f:
                sentences = [line.strip().split() for line in f.readlines()]
            logging.info(f"Loaded {len(sentences)} sentences from preprocessed file.")
        except Exception as e:
            logging.error(f"Could not read preprocessed file {preprocessed_file_path}: {e}")
            return
    else:
        try:
            with open(text_file_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()
        except Exception as e:
            logging.error(f"Could not read file {text_file_path}: {e}")
            return

        if not lines:
            logging.warning(f"File {text_file_path} is empty. Skipping.")
            return

        logging.info(f"Preprocessing text for {decade_name}...")
        sentences = preprocess_turkish_text(lines)
        if not sentences:
            logging.warning(f"No processable sentences found in {text_file_path} for {decade_name}. Skipping model training.")
            return

    # Train the FastText model
    logging.info(f"Training FastText model for {decade_name}...")
    model = FastText(
        sentences=sentences,
        vector_size=VECTOR_SIZE,
        window=WINDOW_SIZE,
        min_count=MIN_WORD_COUNT,
        sg=SKIP_GRAM,
        negative=NEGATIVE,
        epochs=EPOCHS,
        min_n=MIN_N_CHAR_GRAM,
        max_n=MAX_N_CHAR_GRAM,
        workers=NUM_WORKERS
    )

    # Save the trained model
    model_save_path = MODEL_OUTPUT_DIR / f"fasttext_{decade_name}.model"
    try:
        model.save(str(model_save_path))
        logging.info(f"Model for {decade_name} saved to {model_save_path}")
    except Exception as e:
        logging.error(f"Could not save model for {decade_name}: {e}")

    logging.info(f"--- Finished processing for decade: {decade_name} ---")


In [16]:
DATA_DIR.mkdir(parents=True, exist_ok=True)

if not DATA_DIR.is_dir():
    logging.error(f"Data directory not found: {DATA_DIR}")
    logging.error("Please create a 'data' directory and place your decade .txt files (e.g., 1930s.txt) in it.")
    raise FileNotFoundError("Data directory not found")


decade_files = list(DATA_DIR.glob("*.txt"))
sort(decade_files)

if not decade_files:
    logging.warning(f"No .txt files found in {DATA_DIR}. Nothing to process.")
    raise FileNotFoundError("No .txt files found in data directory")

logging.info(f"Found {len(decade_files)} decade files to process: {[f.name for f in decade_files]}")

preprocess_all_decade_files(decade_files)

2025-05-14 21:42:49,620 - root - INFO
Msg: Found 7 decade files to process: ['1940s.txt', '1950s.txt', '1930s.txt', '1980s.txt', '1970s.txt', '1960s.txt', '1990s.txt']

2025-05-14 21:42:49,620 - root - INFO
Msg: --- Preprocessing for 1940s ---



Preprocessing:   0%|                 | 10504/4283897 [01:05<7:21:24, 161.35it/s]


KeyboardInterrupt: 

In [None]:
for text_file in sorted(decade_files): 
    train_decade_model(text_file)

logging.info("All decades processed.")

In [None]:
test_model_path = MODEL_OUTPUT_DIR / "fasttext_1930s.model"
if not test_model_path.exists():
    logging.error(f"Test model {test_model_path} not found. Run training first.")
    exit(1)
logging.info(f"\n--- Loading and testing model: {test_model_path} ---")
loaded_model = FastText.load(str(test_model_path))

In [None]:
# Get vector for a word
try:
    word_vector = loaded_model.wv["cumhuriyet"]
    logging.info(f"Vector for 'cumhuriyet': {word_vector[:5]}...") # Print first 5 dims
except KeyError:
    logging.info("'cumhuriyet' not in vocabulary of 1930s model (or below min_count).")

logging.info(f"Vector for OOV word '{oov_word}': {oov_vector[:5]}...")

In [None]:
# Find most similar word
word = 'mühür'
try:
    similar_words = loaded_model.wv.most_similar(word, topn=100)
    print([t[0] for t in similar_words])
except KeyError:
    logging.info(f"{word} is not in vocabulary of 1930s model (or below min_count).")

In [None]:
# FastText can also get vectors for OOV words if their n-grams are known
oov_word = "yepyenişeyler" # A made-up word
oov_vector = loaded_model.wv[oov_word]