In [None]:
import numpy as np 
import requests
import os
import re

## 1. Download Data
Check whether the dataset exists locally; if it does not, download it from the repository.

In [None]:
# Download the dataset once; later runs reuse the local copy.
if not os.path.exists("NUTUK_1.txt"):
    url = "https://raw.githubusercontent.com/mehmetaksoy/Nutuk-Turkce-NLP-Dataset/main/NUTUK_1.txt"

    try:
        # requests never times out by default; bound the wait so a stalled
        # connection cannot hang the notebook indefinitely.
        response = requests.get(url, timeout=30)
    except requests.RequestException as error:
        # DNS failures, refused connections, timeouts, etc.
        response = None
        print("Failed to get the file ...")
        print(f"Request error: {error}")

    if response is not None:
        if response.status_code == 200:
            # The file is read back as UTF-8 below, so decode the payload as
            # UTF-8 explicitly. Without a charset in the response headers,
            # requests would fall back to ISO-8859-1 for text content and
            # corrupt the Turkish characters.
            response.encoding = "utf-8"
            with open("NUTUK_1.txt", "w", encoding="utf-8") as file:
                file.write(response.text)
        else:
            print("Failed to get the file ...")
            print(f"HTTP status code: {response.status_code}")

## 2. Load Data
Read the file and skip the preamble (header information); the text used for analysis starts at line 283.

In [None]:
# Load every line of the dataset; the preamble is dropped further down.
with open("NUTUK_1.txt", "r", encoding="utf-8") as file:
    lines = list(file)

# Line 283 (1-based) lives at index 282 of the 0-based list.
start_line_index = 282

if len(lines) <= start_line_index:
    # Too short to contain any body text: leave `nutuk` empty so that
    # later cells can detect there is nothing to process.
    nutuk = ""
    print(f"Error: The file has fewer than {start_line_index + 1} lines.")
else:
    # Keep everything from the start line through the end of the file.
    nutuk = "".join(lines[start_line_index:])
    print(f"Loaded text starting from line {start_line_index + 1} (Original Line 283)")
    print("-" * 30)
    print("Preview of the text:")
    print(nutuk[:200])  # first 200 characters as a sanity check
    print("-" * 30)

## 3. Data Cleaning
Lowercase the text, remove punctuation, and split into tokens (words).

In [None]:
# 1. CLEANING THE TEXT
# Produces `nutuk_cleaned` (lowercased, punctuation-free, single-line text)
# and `words` (its whitespace-separated tokens). Skipped when `nutuk` is empty.
if nutuk:
    # Lowercase using Turkish casing rules. str.lower() is locale-independent
    # and maps "I" -> "i", but in Turkish the lowercase of "I" is the dotless
    # "ı" and the lowercase of "İ" is "i". Map those two letters first so that
    # e.g. "IŞIK" becomes "ışık" rather than "işik".
    turkish_lower_map = str.maketrans({"I": "ı", "İ": "i"})
    nutuk_cleaned = nutuk.translate(turkish_lower_map).lower()

    # Remove punctuation/special characters: drop anything that is NOT a word
    # character or whitespace. \w is Unicode-aware for str patterns, so Turkish
    # letters are kept; it also keeps digits and underscores.
    nutuk_cleaned = re.sub(r'[^\w\s]', '', nutuk_cleaned)

    # Replace newlines with spaces to treat the whole text as a continuous stream
    nutuk_cleaned = nutuk_cleaned.replace('\n', ' ')

    # Split into words (tokens) on any run of whitespace
    words = nutuk_cleaned.split()

    print(f"Total words found: {len(words)}")
    print("-" * 30)
    print("First 50 words:")
    print(words[:50])
    print("-" * 30)