In [None]:
import pandas as pd

unseen = pd.read_csv("../data/unseen.csv")
seen = pd.concat(
    [pd.read_csv("../data/train.csv"), pd.read_csv("../data/validation.csv")]
)[["comment_id", "text_original"]]
unseen = unseen[["comment_id", "text_original"]]
df = pd.concat([unseen, seen])
df = df.drop_duplicates(subset=["comment_id"])

In [None]:
import re

KZ_LETTERS = set("әғқңөұүіӘҒҚҢӨҰҮІ")


def is_kazakh_heuristic(text: str, threshold: float = 0.02) -> bool:
    """
    threshold = доля 'казахских' букв среди всех букв.
    Если >= threshold — считаем текст казахским.
    """
    if not isinstance(text, str):
        text = str(text)

    letters = [ch for ch in text if ch.isalpha()]
    if not letters:
        return False

    kz_count = sum(1 for ch in letters if ch in KZ_LETTERS)
    ratio = kz_count / len(letters)

    return ratio >= threshold

In [4]:
df["is_kazakh_heuristic"] = df["text_original"].apply(is_kazakh_heuristic)

In [None]:
KZ_TO_RU_MAP = str.maketrans(
    {
        "ә": "а",
        "ғ": "г",
        "қ": "к",
        "ң": "н",
        "ө": "о",
        "ұ": "у",
        "ү": "у",
        "і": "и",
        "Ә": "А",
        "Ғ": "Г",
        "Қ": "К",
        "Ң": "Н",
        "Ө": "О",
        "Ұ": "У",
        "Ү": "У",
        "І": "И",
    }
)


def kz_to_ru_approx(text: str) -> str:
    if not isinstance(text, str):
        text = str(text)
    return text.translate(KZ_TO_RU_MAP)

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Отбираем "казахские" и "русские" тексты
df_kz = df[df["is_kazakh_heuristic"]].copy()
df_ru = df[~df["is_kazakh_heuristic"]].copy()

# Можно чуть подсэмплить, чтобы классы были сопоставимы по размеру
# например, ограничить до N примеров на класс
N_KZ = min(len(df_kz), 50_000)
N_RU = min(len(df_ru), 50_000)

df_kz = df_kz.sample(N_KZ, random_state=42)
df_ru = df_ru.sample(N_RU, random_state=42)

df_kz["lang"] = "kk"
df_ru["lang"] = "ru"

df_lang = pd.concat([df_kz, df_ru], ignore_index=True)
df_lang = df_lang.dropna(subset=["text_original"])

In [7]:
# Создаём дополнительную копию казахских текстов без спецбукв
df_kz_aug = df_kz.copy()
df_kz_aug["text_original"] = df_kz_aug["text_original"].apply(kz_to_ru_approx)
# тот же label
df_lang = pd.concat([df_lang, df_kz_aug], ignore_index=True)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

X = df_lang["text_original"].astype(str)
y = df_lang["lang"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipeline_lang = Pipeline(
    [
        (
            "tfidf",
            TfidfVectorizer(
                analyzer="char",
                ngram_range=(3, 5),
                min_df=5,  # можно тюнить
                max_features=200_000,  # можно уменьшить, если мало памяти
            ),
        ),
        ("clf", LogisticRegression(max_iter=1000, n_jobs=-1, class_weight="balanced")),
    ]
)

pipeline_lang.fit(X_train, y_train)

print(classification_report(y_test, pipeline_lang.predict(X_test)))

              precision    recall  f1-score   support

          kk       0.96      0.94      0.95     11220
          ru       0.94      0.95      0.95      9999

    accuracy                           0.95     21219
   macro avg       0.95      0.95      0.95     21219
weighted avg       0.95      0.95      0.95     21219



In [10]:
import joblib

joblib.dump(pipeline_lang, "model/langid_ru_kk_model.pkl")

['model/langid_ru_kk_model.pkl']

In [None]:
import joblib

pipeline_lang = joblib.load("model/langid_ru_kk_model.pkl")

In [None]:
pipeline_lang.predict(["ф", "фыа"])

array(['ru', 'ru'], dtype=object)