In [None]:
!pip install telethon git+https://github.com/dontbeidle/kaalin-python.git --quiet

import re, asyncio, nest_asyncio, pandas as pd, os
from telethon import TelegramClient
from telethon.tl.functions.messages import GetHistoryRequest
from kaalin.converter import cyrillic2latin

# --- Telegram credentials --------------------------------------------------
# NOTE(review): the three values below appear redacted (`***`) in this copy and
# must be filled in before the cell can run. Presumably they should be loaded
# from the environment rather than hard-coded — confirm.
# api_id / api_hash are passed to TelegramClient(); phone is passed to
# client.start() and echoed in the sign-in message.
api_id = ***
api_hash = ***
phone = ***
# Channel whose history is requested (used as the `peer` of GetHistoryRequest).
channel_username = "paziyletuz"
# Upper bound on the number of *cleaned* messages collected by fetch_messages().
fetch_limit = 10000
# Maximum number of lines clean_file() keeps in each final output file.
target_sentences = 400


# Raw outputs written by main() (Cyrillic as fetched, Latin after conversion).
cyr_raw = "Karakalpak_Cyrillic_400.txt"
lat_raw = "Karakalpak_Latin_400.txt"
# Final outputs written by clean_file() from the raw files above.
cyr_clean = "Karakalpak_Cyrillic_400_CLEAN.txt"
lat_clean = "Karakalpak_Latin_400_CLEAN.txt"


# Code-point ranges treated as emoji / pictographs. Kept as a character-class
# fragment so it can be reused for both the "start" and "continuation" classes.
_EMOJI_RANGES = (
    "\U0001F1E6-\U0001F1FF"  # regional indicators (flag pairs)
    "\U0001F300-\U0001F5FF"
    "\U0001F600-\U0001F64F"
    "\U0001F680-\U0001F6FF"
    "\U0001F700-\U0001F77F"
    "\U0001F780-\U0001F7FF"
    "\U0001F800-\U0001F8FF"
    "\U0001F900-\U0001F9FF"
    "\U0001FA00-\U0001FAFF"
    "\U0001FB00-\U0001FBFF"
    "\u2600-\u26FF"
    "\u2700-\u27BF"
)

# One emoji followed by any mix of further emoji, variation selector-16
# (U+FE0F) and zero-width joiners (U+200D). The two invisible characters are
# only consumed when they are attached to an emoji, so a ZWJ/VS16 used
# elsewhere in ordinary text is left alone. Compiled once at import time.
_EMOJI_PATTERN = re.compile(
    f"[{_EMOJI_RANGES}][{_EMOJI_RANGES}\uFE0F\u200D]*"
)


def remove_emojis(text):
    """Return *text* with emoji sequences removed.

    Besides the pictographic ranges themselves, this also strips the
    invisible variation selector / zero-width joiner characters that glue
    emoji sequences together (e.g. the U+FE0F after U+2600), which would
    otherwise be left behind as stray invisible characters. Surrounding
    whitespace is not touched; callers collapse it separately.

    Args:
        text: The string to filter.

    Returns:
        The filtered string (possibly empty).
    """
    return _EMOJI_PATTERN.sub("", text)


# Promotional fragments removed from messages. They are matched as literal,
# case-insensitive substrings.
# NOTE(review): the sixth entry looks like a mis-encoded Cyrillic word
# (possibly the Russian stem for "subscribe"); as written it is unlikely to
# match real text — confirm the file encoding before changing it.
_PROMO_TERMS = (
    "instagram", "telegram", "facebook", "youtube",
    "obuna", "–ø–æ–¥–ø–∏—Å", "follow", ".uz", ".en", ".kr", "t.me",
)

# re.escape is essential here: unescaped, the "." in ".uz" or "t.me" is a
# regex wildcard, so e.g. "time" or any character followed by "uz" would be
# deleted from ordinary words.
_PROMO_PATTERNS = tuple(
    re.compile(re.escape(term), re.IGNORECASE) for term in _PROMO_TERMS
)
_LINK_PATTERN = re.compile(r"http\S+|www\.\S+")
_MENTION_OR_TAG_PATTERN = re.compile(r"(@\w+)|#\w+")
_WHITESPACE_PATTERN = re.compile(r"\s+")


def clean_text(text):
    """Normalize one message and decide whether it is worth keeping.

    Steps, in order: strip emojis, remove URLs, remove @mentions and
    #hashtags, remove the promotional fragments in ``_PROMO_TERMS``, then
    collapse all whitespace runs to single spaces.

    Args:
        text: Raw message text; may be empty or ``None``.

    Returns:
        The cleaned single-line string, or ``None`` when the input is
        empty or fewer than three whitespace-separated words remain.
    """
    if not text:
        return None

    text = remove_emojis(text)
    text = _LINK_PATTERN.sub("", text)
    text = _MENTION_OR_TAG_PATTERN.sub("", text)
    # Applied one after another (not as a single alternation) so the result
    # matches the original sequential removal order.
    for pattern in _PROMO_PATTERNS:
        text = pattern.sub("", text)
    text = _WHITESPACE_PATTERN.sub(" ", text).strip()

    if len(text.split()) < 3:
        return None
    return text


async def fetch_messages():
    """Collect cleaned message texts from the configured Telegram channel.

    Signs in with the module-level credentials, then requests the history of
    ``channel_username`` in pages of 100. Each message with text is passed
    through ``clean_text``; results that survive cleaning are collected
    until ``fetch_limit`` of them have been gathered or a page comes back
    empty.

    Returns:
        A list of cleaned message strings, at most ``fetch_limit`` long.

    Raises:
        Whatever the Telegram client raises on sign-in or request failure;
        the client is disconnected before the exception propagates.
    """
    client = TelegramClient("karakalpak_session", api_id, api_hash)
    messages, offset_id = [], 0

    # try/finally so the connection is released even when sign-in or a
    # history request fails part-way through.
    try:
        await client.start(phone)
        print(f"‚úÖ Signed in successfully as {phone}")
        print(f"üì• Fetching messages from @{channel_username} ...")

        while len(messages) < fetch_limit:
            history = await client(GetHistoryRequest(
                peer=channel_username,
                offset_id=offset_id,
                offset_date=None,
                add_offset=0,
                limit=100,
                max_id=0,
                min_id=0,
                hash=0
            ))
            if not history.messages:
                break

            for message in history.messages:
                if len(messages) >= fetch_limit:
                    break
                if not message.message:
                    continue  # no text payload on this message
                clean_msg = clean_text(message.message)
                if clean_msg:
                    messages.append(clean_msg)

            # The next request continues from the last message of this page.
            offset_id = history.messages[-1].id
    finally:
        await client.disconnect()

    return messages


def clean_file(input_path, output_path):
    """Post-process a raw sentence file into the final, de-duplicated set.

    Every non-blank line of *input_path* has emojis and a fixed set of
    quote/punctuation characters removed and its whitespace collapsed.
    Lines with fewer than four words are dropped, duplicates are removed
    (first occurrence wins), and at most ``target_sentences`` lines are
    written to *output_path*, newline-separated with no trailing newline.

    Args:
        input_path: Path of the UTF-8 text file to read.
        output_path: Path of the UTF-8 text file to write.
    """
    with open(input_path, "r", encoding="utf-8") as source:
        stripped_lines = (raw.strip() for raw in source)
        candidates = [entry for entry in stripped_lines if entry]

    # A dict doubles as an insertion-ordered set for de-duplication.
    seen = {}
    for sentence in candidates:
        sentence = remove_emojis(sentence)
        # NOTE(review): this character class looks mis-encoded (presumably
        # typographic quotes, bullet, dashes and ellipsis) — confirm the
        # file encoding; reproduced unchanged here.
        sentence = re.sub(r"[¬´¬ª‚Äú‚Äù\"'‚Äô‚Ä¢‚Äî‚Äì‚Ä¶]", "", sentence)
        sentence = re.sub(r"\s+", " ", sentence).strip()
        if len(sentence.split()) < 4:
            continue
        seen.setdefault(sentence, None)

    cleaned = list(seen)[:target_sentences]

    with open(output_path, "w", encoding="utf-8") as sink:
        sink.write("\n".join(cleaned))
    print(f"‚úÖ Saved {len(cleaned)} lines ‚Üí {output_path}")


async def main():
    """Run the whole pipeline: fetch, save raw files, convert, clean.

    Writes the fetched messages to ``cyr_raw``, their ``cyrillic2latin``
    conversion to ``lat_raw`` (both via pandas, one value per row, no
    index or header), then produces the two cleaned files with
    ``clean_file``.
    """
    # Shared writer options for both raw dumps.
    csv_options = {"index": False, "header": False, "encoding": "utf-8"}

    msgs = await fetch_messages()
    print(f"Fetched {len(msgs)} messages total.")

    cyrillic_series = pd.Series(msgs)
    cyrillic_series.to_csv(cyr_raw, **csv_options)
    print(f"üíæ Saved raw Cyrillic ‚Üí {cyr_raw}")

    converted = list(map(cyrillic2latin, msgs))
    latin_series = pd.Series(converted)
    latin_series.to_csv(lat_raw, **csv_options)
    print(f"üíæ Saved raw Latin ‚Üí {lat_raw}")

    # Cyrillic first, then Latin — same order as the raw dumps.
    for raw_path, clean_path in ((cyr_raw, cyr_clean), (lat_raw, lat_clean)):
        clean_file(raw_path, clean_path)
    print("üéâ DONE. Final cleaned dataset saved.")


# Entry point for the notebook cell.
# nest_asyncio.apply() is called first; presumably this is what lets asyncio
# code run inside the notebook's already-running event loop — confirm against
# the nest_asyncio documentation.
nest_asyncio.apply()
try:
    # Probe for an active event loop; RuntimeError means none is running.
    asyncio.get_running_loop()
except RuntimeError:
    # No loop running: start one and run the pipeline to completion.
    asyncio.run(main())
else:
    # A loop is already running (the notebook case): await the pipeline
    # directly. NOTE(review): top-level `await` is only accepted by
    # IPython/Jupyter; as a plain .py module this line is a SyntaxError.
    await main()
