# Initialization

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import optuna
import joblib

from typing import List
import string

In [2]:
RANDOM_STATE = 0
N_JOBS = 6

In [3]:
import sys
import os

current_folder = os.path.abspath('') 

notebooks_folder = os.path.dirname(current_folder)

project_root = os.path.dirname(notebooks_folder)

if project_root not in sys.path:
    sys.path.append(project_root)

# Reading data

In [4]:
train_df = pd.read_csv("data/processed/train_language_detection_dataset.csv")
train_df.head()

Unnamed: 0,sentence,lan_code
0,"Merry Christmas, Tatoeba!",eng
1,"Крайне важно, чтобы мы поговорили с Томом.",rus
2,Она была весела.,rus
3,Urban sprawl is said to be a major contributor...,eng
4,Только не надо делать большие глаза.,rus


In [5]:
final_test_df = pd.read_csv("data/processed/test_language_detection_dataset.csv")
final_test_df.head()

Unnamed: 0,sentence,lan_code
0,She suspected that it was too late.,eng
1,Я порой бываю рассеян.,rus
2,Он мог бы победить.,rus
3,"Том мог быть не таким счастливым, каким прикид...",rus
4,"Do not forsake me, oh my darling.",eng


In [6]:
X_train = train_df["sentence"]
y_train = train_df["lan_code"]

In [7]:
X_final_test = final_test_df["sentence"]
y_final_test = final_test_df["lan_code"]

# Encoders

In this section we'll compare the learning speed of several different encoders for our texts as well as prepare different preprocessing approaches.

We should also note that all of our encoders are gonna tokenize text by characters and not by words for the reason that is we're trying to classify text and not to find some deep semantic meaning of it, for which separation by characters is a much better approach (it can handle typos and only "looks" for the character distribution by which language can be determined pretty accurately, but we still may be caring for characters order and for this reason we're gonna use custom n-grams parameter values).

## Preprocessors

In this section we're gonna write some simple different preprocessors.

In [7]:
def simple_preprocessor(text: str) -> str:
    return text

In [8]:
def no_punctuation_preprocessor(text: str) -> str:
    blacklist_chars = string.punctuation
    translation_table = str.maketrans("", "", blacklist_chars)
    cleaned_text = text.translate(translation_table)
    return cleaned_text

In [9]:
preprocessors = {
    "simple": simple_preprocessor,
    "no_punct": no_punctuation_preprocessor
}

We're also gonna copy our preprocessor functions as a custom model to the file located at `../../app/models/transformers.py` in order to be able to use them from anywhere later without need to redefine them (and we also may need this for our final model to work properly).

## TF-IDF

scikin-learn's implementation of TF-IDF is offline, which means that it keeps everything in-memory and should be trained in one run. It may not be viable for training and tuning our models, but we still gonna check the perfomance of this approach as well.

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf = TfidfVectorizer(
    analyzer='char_wb', # We're caring only about characters in word bounds
    ngram_range=(1,3),
    max_features=100000,
    preprocessor=no_punctuation_preprocessor,
    lowercase=True,
)

Note that we're not using stopwords as they can be a crutial part in differentiating between languages from one language family (exactly Russian and Ukrainian).

In [15]:
tfidf.fit(X_train)

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,<function no_...0021314003BA0>
,tokenizer,
,analyzer,'char_wb'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


As we can see, this TF-IDF implementation has been training for a pretty long time considering that we'll be tuning hyperparameters of both encoder and our classification model, so we're not likely to use it considering that it is also an offline implementation.

## HashingVectorizer

HashingVectorizer is an online approach, which means that we can partially train it. Also, its scikit-learn implementation should be noticably faster than the implementation of TF-IDF (although it's resuls can be quite worse, but it's still faster to train which means that it's also faster to tune).

In [16]:
from sklearn.feature_extraction.text import HashingVectorizer

In [17]:
vectorizer = HashingVectorizer(
    analyzer='char_wb',
    ngram_range=(1,3),
    n_features=2**20,
    lowercase=True,
    preprocessor=no_punctuation_preprocessor
)

In [20]:
vectorizer.fit_transform(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 172011046 stored elements and shape (2273614, 1048576)>

As this vectorizer is online, it's fit is stateless, but it's whole expensive computation is performed during transform stage. In result we can see, that it's still faster than TF-IDF, so we'll use it during our hyperparameters tuning.

# Models

We'll try two different models - SGDClassifier and Multinomial NaiveBayes, both of which are pretty popular in text classification tasks.

We're not gonna try other popular models, such as XGBoost and LightGBM, in this specific case due to the fact that they are gonna consume a lot more memory (proportional to `amount of rows` * `amount of features`) and be much slower (time of execution is proportional to `amount of rows` * `amount of features` * `amount of trees (weak learners)`) than our two models of choice.

In [14]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [9]:
from app.models.transformers import TextCleaner
from app.models.language_detector import LanguageDetector

## SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier

In [11]:
sgd_model_path = "models/dev/sgd_language_detection_model.joblib"

### Tuning

In [12]:
def sgdclassifier_objective(trial: optuna.Trial):
    # TextCleaner hyperparameters
    text_cleaner_mode = trial.suggest_categorical("text_cleaner_mode", ["simple", "no_punct"])

    text_cleaner = TextCleaner(
        mode=text_cleaner_mode
    )

    # HashingVectorizer hyperparameters
    n_features_pow = trial.suggest_int("n_features_pow", 12, 20)
    ngram_max = trial.suggest_int("ngram_max", 3, 5)

    vectorizer = HashingVectorizer(
        analyzer='char_wb',
        ngram_range=(1, ngram_max),
        n_features=2 ** n_features_pow,
        alternate_sign=True,
        norm='l2',
        lowercase=True
    )

    # SGDClassifier hyperparameters
    alpha = trial.suggest_float("alpha", 1e-6, 1e-2, log=True)
    penalty = trial.suggest_categorical("penalty", ["l2", "elasticnet"])

    # ElasticNet mixing parameter, will be ignored by model if penalty is l2
    l1_ratio = trial.suggest_float("l1_ratio", 0.0, 1.0)

    clf = SGDClassifier(
        loss='log_loss', # LogisticRegression
        penalty=penalty,
        alpha=alpha,
        l1_ratio=l1_ratio,
        class_weight="balanced", # Crutial because we have big class imbalance in our training dataset
        random_state=RANDOM_STATE,
        n_jobs=N_JOBS
    )

    pipeline = Pipeline([
        ("text_cleaner", text_cleaner),
        ("vectorizer", vectorizer),
        ("classifier", clf)
    ])

    language_detector = LanguageDetector(pipeline)

    scores = cross_val_score(
        language_detector,
        X_train,
        y_train,
        cv=3,
        scoring="f1_macro", # Because of our class imbalance
        n_jobs=N_JOBS
    )

    return scores.mean()

Note that we skipped max_iter parameter because it does not impact partial_fit behaviour according to https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html.

In [13]:
study = optuna.create_study(direction="maximize")
study.optimize(sgdclassifier_objective, n_trials=20, show_progress_bar=True)

[I 2026-01-07 15:30:52,879] A new study created in memory with name: no-name-0c194e39-7133-4951-a66b-f6d0b6faacc1


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-07 15:35:45,680] Trial 0 finished with value: 0.6416253465489841 and parameters: {'text_cleaner_mode': 'no_punct', 'n_features_pow': 13, 'ngram_max': 5, 'alpha': 0.007111215148975124, 'penalty': 'elasticnet', 'l1_ratio': 0.048867313591985484}. Best is trial 0 with value: 0.6416253465489841.
[I 2026-01-07 15:40:00,769] Trial 1 finished with value: 0.9500587582522858 and parameters: {'text_cleaner_mode': 'no_punct', 'n_features_pow': 13, 'ngram_max': 4, 'alpha': 0.00022691657594676922, 'penalty': 'elasticnet', 'l1_ratio': 0.8735592215668877}. Best is trial 1 with value: 0.9500587582522858.
[I 2026-01-07 15:44:17,065] Trial 2 finished with value: 0.9760172285568324 and parameters: {'text_cleaner_mode': 'no_punct', 'n_features_pow': 20, 'ngram_max': 5, 'alpha': 3.8176992409193474e-05, 'penalty': 'elasticnet', 'l1_ratio': 0.6088666619775157}. Best is trial 2 with value: 0.9760172285568324.
[I 2026-01-07 15:47:53,714] Trial 3 finished with value: 0.9638567396190442 and parameters:

In [14]:
study.best_params

{'text_cleaner_mode': 'simple',
 'n_features_pow': 15,
 'ngram_max': 4,
 'alpha': 1.011712448172737e-06,
 'penalty': 'l2',
 'l1_ratio': 0.373624350566634}

```
{'text_cleaner_mode': 'simple',
 'n_features_pow': 15,
 'ngram_max': 4,
 'alpha': 1.011712448172737e-06,
 'penalty': 'l2',
 'l1_ratio': 0.373624350566634}
```

### Saving model

We'll save best parameters into a separate variable in order to not need to go through hyperparameter tuning once again to get them.

In [15]:
sgd_best_params = {
    'text_cleaner_mode': 'simple',
    'n_features_pow': 15,
    'ngram_max': 4,
    'alpha': 1.011712448172737e-06,
    'penalty': 'l2',
    'l1_ratio': 0.373624350566634
}

Now we're gonna save our model using joblib in order to not go through hyperparameter tuning stage again in the future.

As our text cleaner parameter ended up being `simple`, our model will lead best perfomance without additional preprocessing applied on text.

In [18]:
try:
    xgb_params = study.best_params
except:
    xgb_params = sgd_best_params

final_sgd_vectorizer = HashingVectorizer(
    analyzer='char_wb',
    ngram_range=(1, xgb_params["ngram_max"]),
    n_features=2 ** xgb_params["n_features_pow"],
    alternate_sign=True,
    norm='l2',
    lowercase=True
)

final_sgd_clf = SGDClassifier(
    loss='log_loss',
    penalty=xgb_params["penalty"],
    alpha=xgb_params["alpha"],
    l1_ratio=xgb_params["l1_ratio"],
    class_weight="balanced",
    random_state=RANDOM_STATE,
    n_jobs=N_JOBS
)

final_sgd_pipeline = Pipeline([
    ("vectorizer", final_sgd_vectorizer),
    ("classifier", final_sgd_clf)
])

sgd_language_detector = LanguageDetector(final_sgd_pipeline)

In [19]:
sgd_language_detector.fit(X_train, y_train)

0,1,2
,clf,Pipeline(step...om_state=0))])

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,loss,'log_loss'
,penalty,'l2'
,alpha,1.011712448172737e-06
,l1_ratio,0.373624350566634
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [20]:
joblib.dump(sgd_language_detector, sgd_model_path)

['models/dev/sgd_language_detection_model.joblib']

### Loading model

And now we can load our already tuned model and see it's structure.

In [21]:
sgd_model = joblib.load(sgd_model_path)
sgd_model

0,1,2
,clf,Pipeline(step...om_state=0))])

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,loss,'log_loss'
,penalty,'l2'
,alpha,1.011712448172737e-06
,l1_ratio,0.373624350566634
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


As we've stated earlier, we can use our model as-is - no additional text preprocessing needed!

## MultinomialNB

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [31]:
mnb_model_path = "models/dev/mnb_language_detection_model.joblib"

### Tuning

In [32]:
def mnbclassifier_objective(trial: optuna.Trial):
    # TextCleaner hyperparameters
    text_cleaner_mode = trial.suggest_categorical("text_cleaner_mode", ["simple", "no_punct"])

    text_cleaner = TextCleaner(
        mode=text_cleaner_mode
    )

    # HashingVectorizer hyperparameters
    n_features_pow = trial.suggest_int("n_features_pow", 12, 20)
    ngram_max = trial.suggest_int("ngram_max", 3, 5)
    vectorizer_norm = trial.suggest_categorical("vectorizer_norm", ["l2", None])

    vectorizer = HashingVectorizer(
        analyzer='char_wb',
        ngram_range=(1, ngram_max),
        n_features=2 ** n_features_pow,
        alternate_sign=False, # Values must be non-negative for MultinomialNB
        norm=vectorizer_norm,
        lowercase=True
    )

    # MultinomialNB hyperparameters
    alpha = trial.suggest_float("alpha", 1e-10, 10.0, log=True)
    fit_prior = trial.suggest_categorical("fit_prior", [True, False])

    clf = MultinomialNB(
        alpha=alpha,
        fit_prior=fit_prior
    )

    pipeline = Pipeline([
        ("text_cleaner", text_cleaner),
        ("vectorizer", vectorizer),
        ("classifier", clf)
    ])

    language_detector = LanguageDetector(pipeline)

    scores = cross_val_score(
        language_detector,
        X_train,
        y_train,
        cv=3,
        scoring="f1_macro", # Because of our class imbalance
        n_jobs=N_JOBS
    )

    return scores.mean()

In [34]:
study = optuna.create_study(direction="maximize")
study.optimize(mnbclassifier_objective, n_trials=20, show_progress_bar=True)

[I 2026-01-07 18:19:34,695] A new study created in memory with name: no-name-4010187c-4e22-43cc-b0dc-a108707b9980


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-07 18:21:45,203] Trial 0 finished with value: 0.9891935103412478 and parameters: {'text_cleaner_mode': 'no_punct', 'n_features_pow': 20, 'ngram_max': 3, 'vectorizer_norm': 'l2', 'alpha': 9.458303809457318e-06, 'fit_prior': False}. Best is trial 0 with value: 0.9891935103412478.
[I 2026-01-07 18:24:33,131] Trial 1 finished with value: 0.9924869249683289 and parameters: {'text_cleaner_mode': 'simple', 'n_features_pow': 18, 'ngram_max': 4, 'vectorizer_norm': 'l2', 'alpha': 9.08413893385699e-06, 'fit_prior': True}. Best is trial 1 with value: 0.9924869249683289.
[I 2026-01-07 18:27:27,950] Trial 2 finished with value: 0.9887192434414064 and parameters: {'text_cleaner_mode': 'no_punct', 'n_features_pow': 14, 'ngram_max': 3, 'vectorizer_norm': None, 'alpha': 0.028403104290041604, 'fit_prior': True}. Best is trial 1 with value: 0.9924869249683289.
[I 2026-01-07 18:30:35,298] Trial 3 finished with value: 0.9896048877107709 and parameters: {'text_cleaner_mode': 'simple', 'n_features_

In [35]:
study.best_params

{'text_cleaner_mode': 'simple',
 'n_features_pow': 19,
 'ngram_max': 5,
 'vectorizer_norm': None,
 'alpha': 3.8254180916274174e-07,
 'fit_prior': True}

```
{'text_cleaner_mode': 'simple',
 'n_features_pow': 19,
 'ngram_max': 5,
 'vectorizer_norm': None,
 'alpha': 3.8254180916274174e-07,
 'fit_prior': True}
 ```

### Saving model

We'll save best parameters to a separate variable in order to not need to go through hyperparameter tuning once again to get them.

In [36]:
mnb_best_params = {
    'text_cleaner_mode': 'simple',
    'n_features_pow': 19,
    'ngram_max': 5,
    'vectorizer_norm': None,
    'alpha': 3.8254180916274174e-07,
    'fit_prior': True
}

Now we're gonna save our model using joblib in order to not go through hyperparameter tuning stage again in the future.

As our text cleaner parameter ended up being `simple`, our model will lead best perfomance without additional preprocessing applied on text.

In [37]:
try:
    xgb_params = study.best_params
except:
    xgb_params = mnb_best_params

final_mnb_vectorizer = HashingVectorizer(
    analyzer='char_wb',
    ngram_range=(1, xgb_params["ngram_max"]),
    n_features=2 ** xgb_params["n_features_pow"],
    alternate_sign=False,
    norm=xgb_params["vectorizer_norm"],
    lowercase=True
)

final_mnb_clf = MultinomialNB(
    alpha=xgb_params["alpha"],
    fit_prior=xgb_params["fit_prior"]
)

final_mnb_pipeline = Pipeline([
    ("vectorizer", final_mnb_vectorizer),
    ("classifier", final_mnb_clf)
])

mnb_language_detector = LanguageDetector(final_mnb_pipeline)

In [38]:
mnb_language_detector.fit(X_train, y_train)

0,1,2
,clf,Pipeline(step...274174e-07))])

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,alpha,3.8254180916274174e-07
,force_alpha,True
,fit_prior,True
,class_prior,


In [39]:
joblib.dump(mnb_language_detector, mnb_model_path)

['models/dev/mnb_language_detection_model.joblib']

### Loading model

And now we can load our already tuned model and see it's structure.

In [40]:
mnb_model = joblib.load(mnb_model_path)
mnb_model

0,1,2
,clf,Pipeline(step...274174e-07))])

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,alpha,3.8254180916274174e-07
,force_alpha,True
,fit_prior,True
,class_prior,


As we've stated earlier, we can use our model as-is - no additional text preprocessing needed!