In [3]:
import numpy as np 
import requests
import os
import re

## 1. Download Data
Check whether the dataset exists locally; if it does not, download it from the repository.

In [4]:
# Download the corpus only when no local copy exists, so re-running this cell is cheap
if not os.path.exists("NUTUK_1.txt"):
    url = "https://raw.githubusercontent.com/mehmetaksoy/Nutuk-Turkce-NLP-Dataset/main/NUTUK_1.txt"
    # Without a timeout, requests may wait on an unresponsive server indefinitely
    response = requests.get(url, timeout=30)

    if response.status_code == 200:
        # Store the payload byte-for-byte instead of round-tripping it through
        # `response.text`: that avoids relying on the encoding requests infers and
        # on platform newline translation, either of which could corrupt the text
        # or shift the line numbers used when the file is read back as UTF-8.
        with open("NUTUK_1.txt", "wb") as file:
            file.write(response.content)
    else:
        print("Failed to get the file ...")

## 2. Load Data
Read the file and skip the preamble (header information): the text used for modelling starts at line 283.

In [5]:
# Load the corpus and discard everything that precedes line 283 (the preamble)
# Line 283 in 1-based numbering sits at index 282 of the 0-based list of lines
start_line_index = 282

with open("NUTUK_1.txt", "r", encoding="utf-8") as file:
    lines = list(file)

if len(lines) <= start_line_index:
    # File too short to contain the expected starting line: keep the text empty
    nutuk = ""
    print(f"Error: The file has fewer than {start_line_index + 1} lines.")
else:
    # Concatenate every line from the starting index through the end of the file
    nutuk = "".join(lines[start_line_index:])
    print(f"Loaded text starting from line {start_line_index + 1} (Original Line 283)")
    print("-" * 30)
    print("Preview of the text:")
    # The first 200 characters are enough to check that the offset is right
    print(nutuk[:200])
    print("-" * 30)

Loaded text starting from line 283 (Original Line 283)
------------------------------
Preview of the text:
NUTUK 1
(1919-1920) 
Samsun'a çıktığım gün genel vaziyet ve manzara
1919 senesi Mayıs'ının 19. günü Samsun'a çıktım. Genel vaziyet ve manzara: 
Osmanlı Devleti'nin dahil bulunduğu grup, Harbi Umumi'de
------------------------------


## 3. Data Cleaning
Lowercase the text, keep sentence punctuation (`. , ! ? ;`) as separate tokens, remove all other symbols, and split into tokens (words).

In [6]:
# 1. CLEANING THE TEXT
# Define the results up front so later cells always find `words` and
# `nutuk_cleaned`, even when no text was loaded.
nutuk_cleaned = ""
words = []

if nutuk:
    # Convert to lowercase
    # NOTE(review): str.lower() is not Turkish-aware ("I" becomes "i", not "ı") — confirm this is acceptable
    nutuk_cleaned = nutuk.lower()

    # Add space around sentence punctuation so that each mark becomes its own token
    nutuk_cleaned = re.sub(r'([.,!?;])', r' \1 ', nutuk_cleaned)

    # Remove every character that is neither a word character, whitespace, nor one
    # of the punctuation marks . , ! ? ; preserved above.
    # \w is Unicode-aware for str patterns (letters, digits, underscore), so Turkish
    # letters are kept, while apostrophes, quotes, parentheses, hyphens etc. are dropped.
    nutuk_cleaned = re.sub(r'[^\w\s.,!?;]', '', nutuk_cleaned)

    # Replace newlines with spaces to treat the whole text as a continuous stream
    nutuk_cleaned = nutuk_cleaned.replace('\n', ' ')

    # Split into tokens (words and the preserved punctuation marks)
    words = nutuk_cleaned.split()

    print(f"Total words found: {len(words)}")
    print("-" * 30)
    print("First 50 words:")
    print(words[:50])
    print("-" * 30)
else:
    print("No text loaded; nothing to clean (words is empty).")

## 4. Build Markov Model
Create the vocabulary and, for every order from 1 to `max_order`, a table that counts which word follows each n-gram state.

In [9]:
# Define N-Gram Markov Chain Class
class MarkovChain:
    """Word-level n-gram Markov chain with backoff, additive smoothing and temperature.

    For every order ``n`` from 1 to ``max_order`` the model counts how often each
    token follows each state of ``n`` consecutive tokens. Prediction uses the
    longest suffix of the history that was seen during training and backs off to
    shorter suffixes otherwise.

    Attributes:
        words: The training token sequence.
        max_order: Longest state length (in tokens) that is modelled.
        vocab: Sorted list of the distinct tokens in ``words``.
        vocab_size: Number of distinct tokens.
        word_to_index: Maps a token to its position in ``vocab``.
        index_to_word: Maps a position in ``vocab`` back to its token.
        models: ``models[n][state][next_word]`` is the number of times
            ``next_word`` followed the n-token tuple ``state``.
    """

    def __init__(self, words, max_order=3, seed=None):
        """Index the vocabulary and build the count tables.

        Args:
            words: Sequence of tokens to train on.
            max_order: Highest n-gram order to build.
            seed: Optional seed for a private random generator, which makes
                sampling reproducible. When None (the default), sampling draws
                from NumPy's global random state.
        """
        self.words = words
        self.max_order = max_order
        self.vocab = sorted(set(self.words))
        self.vocab_size = len(self.vocab)
        self.word_to_index = {word: i for i, word in enumerate(self.vocab)}
        self.index_to_word = {i: word for i, word in enumerate(self.vocab)}
        # Both the np.random module and a RandomState instance expose `.choice`
        self._rng = np.random if seed is None else np.random.RandomState(seed)
        self.models = {}
        self._build_model()

    @staticmethod
    def _tokenize(text):
        """Lowercase ``text``, isolate . , ! ? ; as tokens, drop other symbols, split on whitespace."""
        # NOTE(review): str.lower() is not Turkish-aware ("I" becomes "i", not "ı") — confirm this is acceptable
        text = text.lower()
        text = re.sub(r'([.,!?;])', r' \1 ', text)
        text = re.sub(r'[^\w\s.,!?;]', '', text)
        # split() already treats newlines as separators
        return text.split()

    def _build_model(self):
        """Fill ``self.models`` with follower counts for orders 1..max_order."""
        for n in range(1, self.max_order + 1):
            table = {}
            # The last n tokens have no complete (state, next word) pair
            for i in range(len(self.words) - n):
                state = tuple(self.words[i:i + n])
                next_word = self.words[i + n]
                followers = table.setdefault(state, {})
                followers[next_word] = followers.get(next_word, 0) + 1
            self.models[n] = table
        print(f"Models built up to order {self.max_order}")

    def _get_transition_probs(self, history, alpha=0.001):
        """Return the next-word distribution for ``history``.

        Args:
            history: Sequence of preceding tokens.
            alpha: Additive smoothing constant given to every vocabulary entry
                before the observed counts are added.

        Returns:
            A tuple ``(probs, found_transitions)``. ``probs`` is an array of
            length ``vocab_size`` aligned with ``vocab``. ``found_transitions``
            is the count dict of the matched state, or None when no suffix of
            the history was seen; in that case ``probs`` is uniform.
        """
        # Backoff strategy: try the longest usable suffix first, then shorter ones
        found_transitions = None
        for n in range(min(len(history), self.max_order), 0, -1):
            state = tuple(history[-n:])
            if state in self.models[n]:
                found_transitions = self.models[n][state]
                break

        if found_transitions is None:
            return np.ones(self.vocab_size) / self.vocab_size, None

        # Smoothed counts: every word starts at alpha, observed followers add their count
        probs = np.full(self.vocab_size, alpha, dtype=float)
        for next_word, count in found_transitions.items():
            if next_word in self.word_to_index:
                probs[self.word_to_index[next_word]] += count

        return probs / np.sum(probs), found_transitions

    def get_next_word(self, history, temperature=1.0, alpha=0.001):
        """Pick the next token given the preceding tokens.

        Args:
            history: Sequence of preceding tokens.
            temperature: Values below 1 sharpen the distribution, values above 1
                flatten it. Anything below 0.05 selects the most probable token
                deterministically.
            alpha: Additive smoothing constant, see ``_get_transition_probs``.

        Returns:
            The chosen token.

        Raises:
            ValueError: If the model was built from an empty token sequence.
        """
        if self.vocab_size == 0:
            raise ValueError("Cannot sample a next word: the model vocabulary is empty.")

        probs, _ = self._get_transition_probs(history, alpha)

        # Handle very low temperature (deterministic / argmax) to avoid numerical instability
        if temperature < 0.05:
            return self.index_to_word[int(np.argmax(probs))]

        if temperature != 1.0:
            probs = np.power(probs, 1.0 / temperature)
            probs = probs / np.sum(probs)

        # An integer population draws from 0..vocab_size-1 without rebuilding an index array
        next_idx = int(self._rng.choice(self.vocab_size, p=probs))
        return self.index_to_word[next_idx]

    def generate_text(self, start_text, length=20, temperature=1.0, alpha=0.001):
        """Continue ``start_text`` with ``length`` generated tokens.

        Args:
            start_text: Raw prompt; it is lowercased and tokenized by ``_tokenize``.
            length: Number of tokens to append.
            temperature: Passed to ``get_next_word``.
            alpha: Passed to ``get_next_word``.

        Returns:
            The prompt tokens followed by the generated tokens, joined by single
            spaces, or an empty string when the prompt contains no tokens.
        """
        tokens = self._tokenize(start_text)
        if not tokens:
            return ""

        for _ in range(length):
            # The growing token list is the history; only its tail is inspected
            tokens.append(self.get_next_word(tokens, temperature, alpha))
        return " ".join(tokens)

print("MarkovChain Class defined successfully.")

## 5. Prediction Function
Instantiate the model. Its `get_next_word` method applies **additive (Laplace-style) smoothing** and **temperature** scaling when choosing the next word.

In [10]:
# Build the n-gram model from the cleaned token list
max_order = 3  # longest state, in tokens, that the model keeps counts for
model = MarkovChain(words=words, max_order=max_order)
print(f"Markov Model initiated with Order {max_order}.")

## 6. Generate Text
Let's test the model with different temperatures.

In [11]:
# Prompt shared by both samples
start_text = "millet ve"

# Temperature below 1 sharpens the next-word distribution before sampling
print(f"--- Temp 0.5 (Balanced) ---")
balanced_sample = model.generate_text(start_text, length=15, temperature=0.5)
print(balanced_sample)

# Temperature close to 1 leaves the distribution nearly unchanged
print(f"\n--- Temp 0.9 (Creative) ---")
creative_sample = model.generate_text(start_text, length=15, temperature=0.9)
print(creative_sample)

## 7. Visualize Transition Probabilities
We use bar charts to show the most likely next words for a few selected contexts. This makes the transitions easier to interpret than a sparse heatmap.

In [20]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

def visualize_transitions(history, ax=None):
    """Draw the five most probable next words for a context as a horizontal bar chart.

    The probabilities come from the module-level ``model`` via its
    ``_get_transition_probs`` method, using that method's default smoothing.

    Args:
        history: The context, either as a raw string (lowercased, with
            . , ! ? ; isolated as tokens and other symbols removed) or as an
            already tokenized list of words.
        ax: Matplotlib axes to draw on. A new 10x5 figure is created when omitted.
    """
    if isinstance(history, str):
        # re.sub needs a string, so all normalisation happens before the split
        history = history.lower()
        history = re.sub(r'([.,!?;])', r' \1 ', history)
        history = re.sub(r'[^\w\s.,!?;]', '', history)
        history = history.split()

    # Use the model's logic to get probs directly
    probs, _ = model._get_transition_probs(history)

    # argsort is ascending: take the last five entries and reverse for highest-first
    top_indices = np.argsort(probs)[-5:][::-1]
    top_words = [model.index_to_word[idx] for idx in top_indices]
    top_probs = [probs[idx] for idx in top_indices]

    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 5))

    sns.barplot(x=top_probs, y=top_words, ax=ax, palette="viridis")
    context_str = " ".join(history[-3:]) # Show last few words
    ax.set_title(f"Next words after: '...{context_str}'")
    ax.set_xlabel("Probability")

# Demo: one stacked panel per context
contexts = ["millet", "türk milleti", "büyük millet", "ordu ve", "vatan"]
n_panels = len(contexts)
fig, axes = plt.subplots(n_panels, 1, figsize=(10, 3 * n_panels))
plt.subplots_adjust(hspace=0.5)

# plt.subplots hands back a bare Axes for a single panel and an array otherwise,
# so normalise to a list before pairing each context with its panel
panel_axes = list(axes) if n_panels > 1 else [axes]
for ctx, ax in zip(contexts, panel_axes):
    visualize_transitions(ctx, ax=ax)

plt.show()