<a href="https://colab.research.google.com/github/celaltrk/Persian-Turkish-NLP/blob/main/Turkish_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gensim==4.3.2

Collecting gensim==4.3.2
  Downloading gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.3 kB)
Downloading gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.3.2


In [2]:
import os
import re
import logging
from tqdm import tqdm
from pathlib import Path
from gensim.models import FastText
from google.colab import drive

# Mount Google Drive so the corpus and the trained models persist across sessions.
drive.mount('/content/drive')

# --- Configuration ---
DATA_DIR = Path("/content/drive/MyDrive/turkish_data")
MODEL_OUTPUT_DIR = Path("/content/drive/MyDrive/fasttext_models")
MODEL_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# basicConfig() is a no-op when the root logger already has handlers, which is
# commonly the case in hosted notebook runtimes; force=True replaces them so
# the INFO-level progress messages used throughout this notebook are shown.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
    force=True,
)

# FastText Model Parameters (adjust as needed)
VECTOR_SIZE = 100  # Dimensionality of the word vectors
WINDOW_SIZE = 5    # Context window size
MIN_WORD_COUNT = 5 # Words seen fewer times than this are dropped from the vocabulary
SKIP_GRAM = 1      # 1 for Skip-gram, 0 for CBOW
EPOCHS = 15        # Number of training iterations over the corpus
MIN_N_CHAR_GRAM = 3 # Minimum length of char n-grams
MAX_N_CHAR_GRAM = 6 # Maximum length of char n-grams
NUM_WORKERS = os.cpu_count() or 4 # All available CPU cores; 4 if the count is unknown

ImportError: cannot import name 'triu' from 'scipy.linalg' (/usr/local/lib/python3.11/dist-packages/scipy/linalg/__init__.py)

In [None]:
def preprocess_turkish_text(text_lines):
    """
    Clean and tokenize Turkish text, producing one token list per input line.

    Each line is:
    1. Lowercased using Turkish casing rules ('I' -> 'ı', 'İ' -> 'i').
    2. Stripped of punctuation, underscores and digits; letters and
       whitespace are kept.
    3. Split on whitespace.

    Args:
        text_lines: Iterable of strings, one raw text line each.

    Returns:
        A list of token lists. Lines that contain no tokens after cleaning
        are omitted.
    """
    # str.lower() is locale-independent: it turns 'I' into 'i' (Turkish needs
    # the dotless 'ı') and 'İ' into 'i' followed by a combining dot (U+0307).
    # Map both capitals explicitly before lowercasing the rest.
    turkish_capitals = str.maketrans({"I": "ı", "İ": "i"})

    # \w is Unicode-aware, so Turkish letters (ç, ğ, ı, ö, ş, ü) are kept
    # without listing them. \w also matches digits and '_', so those are
    # removed by the second alternative.
    non_letters = re.compile(r'[^\w\s]|[\d_]')

    processed_sentences = []
    for line in tqdm(text_lines, desc="Preprocessing text lines"):
        lowered = line.translate(turkish_capitals).lower()
        tokens = non_letters.sub('', lowered).split()  # Tokenize by whitespace
        if tokens:  # Skip lines that are empty after cleaning
            processed_sentences.append(tokens)
    return processed_sentences

In [None]:
def train_decade_model(text_file_path):
    """
    Train and save a FastText model for a single decade's text file.

    The file stem (e.g. "1930s" for "1930s.txt") is used as the decade name
    and becomes part of the saved model's filename under MODEL_OUTPUT_DIR.

    Every failure mode (unreadable file, empty file, nothing left after
    preprocessing, training error, save error) is logged and the function
    returns, so one bad decade does not stop a loop over several files.

    Args:
        text_file_path: pathlib.Path of a UTF-8 text file for one decade.

    Returns:
        None.
    """
    decade_name = text_file_path.stem  # e.g., "1930s" from "1930s.txt"
    logging.info(f"--- Processing decade: {decade_name} ---")

    # Read the text file
    try:
        with open(text_file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except Exception as e:
        logging.error(f"Could not read file {text_file_path}: {e}")
        return

    if not lines:
        logging.warning(f"File {text_file_path} is empty. Skipping.")
        return

    # Preprocess the text
    logging.info(f"Preprocessing text for {decade_name}...")
    sentences = preprocess_turkish_text(lines)

    if not sentences:
        logging.warning(f"No processable sentences found in {text_file_path} for {decade_name}. Skipping model training.")
        return

    # Train the FastText model
    logging.info(f"Training FastText model for {decade_name}...")
    try:
        model = FastText(
            sentences=sentences,
            vector_size=VECTOR_SIZE,
            window=WINDOW_SIZE,
            min_count=MIN_WORD_COUNT,
            sg=SKIP_GRAM,
            epochs=EPOCHS,
            min_n=MIN_N_CHAR_GRAM,
            max_n=MAX_N_CHAR_GRAM,
            workers=NUM_WORKERS
        )
    except RuntimeError as e:
        # gensim raises RuntimeError when it cannot train, e.g. when no word
        # reaches min_count and the vocabulary is empty. Handle it like the
        # read/save failures: log and skip this decade.
        logging.error(f"Could not train model for {decade_name}: {e}")
        return

    # Save the trained model
    model_save_path = MODEL_OUTPUT_DIR / f"fasttext_{decade_name}.model"
    try:
        model.save(str(model_save_path))
        logging.info(f"Model for {decade_name} saved to {model_save_path}")
    except Exception as e:
        logging.error(f"Could not save model for {decade_name}: {e}")

    logging.info(f"--- Finished processing for decade: {decade_name} ---")

In [None]:
# Expected input: one UTF-8 .txt file per decade directly inside DATA_DIR,
# named after the decade (e.g. 1930s.txt). The file stem is used as the decade
# name when the model is saved.
#
# The directory is created if it is missing, so a separate "directory not
# found" check after this line could never fire; the missing-input case is
# reported by the "no .txt files" check below instead.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Sort once (lexicographic by path, which is chronological for names like
# 1930s.txt) so the logged list matches the processing order.
decade_files = sorted(DATA_DIR.glob("*.txt"))

if not decade_files:
    logging.warning(f"No .txt files found in {DATA_DIR}. Nothing to process.")
    logging.error(f"Please place your decade .txt files (e.g., 1930s.txt) in {DATA_DIR}.")
    raise FileNotFoundError("No .txt files found in data directory")

logging.info(f"Found {len(decade_files)} decade files to process: {[f.name for f in decade_files]}")

for text_file in decade_files:
    train_decade_model(text_file)

logging.info("All decades processed.")

In [None]:
# --- Example: How to load and use a trained model ---
# This is for demonstration after training is complete.
# You would run this part separately or after training.

# test_model_path = MODEL_OUTPUT_DIR / "fasttext_1930s.model"
# if test_model_path.exists():
#     logging.info(f"\n--- Loading and testing model: {test_model_path} ---")
#     loaded_model = FastText.load(str(test_model_path))

#     # Get vector for a word
#     try:
#         word_vector = loaded_model.wv["cumhuriyet"]
#         logging.info(f"Vector for 'cumhuriyet': {word_vector[:5]}...") # Print first 5 dims
#     except KeyError:
#         logging.info("'cumhuriyet' not in vocabulary of 1930s model (or below min_count).")

#     # Find most similar words
#     try:
#         similar_words = loaded_model.wv.most_similar("ekonomi", topn=5)
#         logging.info(f"Words similar to 'ekonomi': {similar_words}")
#     except KeyError:
#         logging.info("'ekonomi' not in vocabulary of 1930s model (or below min_count).")

#     # FastText can also get vectors for OOV words if their n-grams are known
#     oov_word = "yepyenişeyler" # A made-up word
#     oov_vector = loaded_model.wv[oov_word]
#     logging.info(f"Vector for OOV word '{oov_word}': {oov_vector[:5]}...")
# else:
#     logging.info(f"Test model {test_model_path} not found. Run training first.")
# --- End of Example Usage ---