# Initialization

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import optuna
import joblib

from typing import List
import string

In [23]:
RANDOM_STATE = 0
N_JOBS = 6

# Reading data

In [3]:
train_df = pd.read_csv("../../data/processed/train_language_detection_dataset.csv")
train_df.head()

Unnamed: 0,sentence,lan_code
0,"Merry Christmas, Tatoeba!",eng
1,"Крайне важно, чтобы мы поговорили с Томом.",rus
2,Она была весела.,rus
3,Urban sprawl is said to be a major contributor...,eng
4,Только не надо делать большие глаза.,rus


In [4]:
final_test_df = pd.read_csv("../../data/processed/test_language_detection_dataset.csv")
final_test_df.head()

Unnamed: 0,sentence,lan_code
0,She suspected that it was too late.,eng
1,Я порой бываю рассеян.,rus
2,Он мог бы победить.,rus
3,"Том мог быть не таким счастливым, каким прикид...",rus
4,"Do not forsake me, oh my darling.",eng


In [13]:
X_train = train_df["sentence"]
y_train = train_df["lan_code"]

In [14]:
X_final_test = final_test_df["sentence"]
y_final_test = final_test_df["lan_code"]

# Encoders

In this section we'll compare the learning speed of several different encoders for our texts as well as prepare different preprocessing approaches.

We should also note that all of our encoders are gonna tokenize text by characters and not by words for the reason that is we're trying to classify text and not to find some deep semantic meaning of it, for which separation by characters is a much better approach (it can handle typos and only "looks" for the character distribution by which language can be determined pretty accurately, but we still may be caring for characters order and for this reason we're gonna use custom n-grams parameter values).

## Preprocessors

In this section we're gonna write some simple different preprocessors.

In [10]:
def simple_preprocessor(text: str) -> str:
    return text

In [11]:
def no_punctuation_preprocessor(text: str) -> str:
    blacklist_chars = string.punctuation
    translation_table = str.maketrans("", "", blacklist_chars)
    cleaned_text = text.translate(translation_table)
    return cleaned_text

In [21]:
preprocessors = {
    "simple": simple_preprocessor,
    "no_punct": no_punctuation_preprocessor
}

## TF-IDF

scikin-learn's implementation of TF-IDF is offline, which means that it keeps everything in-memory and should be trained in one run. It may not be viable for training and tuning our models, but we still gonna check the perfomance of this approach as well.

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf = TfidfVectorizer(
    analyzer='char_wb', # We're caring only about characters in word bounds
    ngram_range=(1,3),
    max_features=100000,
    preprocessor=no_punctuation_preprocessor,
    lowercase=True,
)

Note that we're not using stopwords as they can be a crutial part in differentiating between languages from one language family (exactly Russian and Ukrainian).

In [15]:
tfidf.fit(X_train)

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,<function no_...0021314003BA0>
,tokenizer,
,analyzer,'char_wb'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


As we can see, this TF-IDF implementation has been training for a pretty long time considering that we'll be tuning hyperparameters of both encoder and our classification model, so we're not likely to use it considering that it is also an offline implementation.

## HashingVectorizer

HashingVectorizer is an online approach, which means that we can partially train it. Also, its scikit-learn implementation should be noticably faster than the implementation of TF-IDF (although it's resuls can be quite worse, but it's still faster to train which means that it's also faster to tune).

In [16]:
from sklearn.feature_extraction.text import HashingVectorizer

In [17]:
vectorizer = HashingVectorizer(
    analyzer='char_wb',
    ngram_range=(1,3),
    n_features=2**20,
    lowercase=True,
    preprocessor=no_punctuation_preprocessor
)

In [20]:
vectorizer.fit_transform(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 172011046 stored elements and shape (2273614, 1048576)>

As this vectorizer is online, it's fit is stateless, but it's whole expensive computation is performed during transform stage. In result we can see, that it's still faster than TF-IDF, so we'll use it during our hyperparameters tuning.

# Models

We'll try three different models - SGDClassifier, Multinomial NaiveBayes and XGBoost, all of which are pretty popular in text classification tasks.

## SGDClassifier

### Tuning

In [26]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [None]:
def sgdclassifier_objective(trial: optuna.Trial):
    # HashingVectorizer hyperparameters
    prep_name = trial.suggest_categorical("preprocessor", list(preprocessors.keys()))
    preprocessor = preprocessors[prep_name]
    n_features_pow = trial.suggest_int("n_features_pow", 12, 20)
    ngram_max = trial.suggest_int("ngram_max", 3, 5)

    vectorizer = HashingVectorizer(
        analyzer='char_wb',
        ngram_range=(1, ngram_max),
        n_features=2 ** n_features_pow,
        preprocessor=preprocessor,
        alternate_sign=True,
        norm='l2',
        lowercase=True
    )

    # SGDClassifier hyperparameters
    alpha = trial.suggest_float("alpha", 1e-6, 1e-2, log=True)
    penalty = trial.suggest_categorical("penalty", ["l2", "elasticnet"])

    # ElasticNet mixing parameter, will be ignored by model if penalty is l2
    l1_ratio = trial.suggest_float("l1_ratio", 0.0, 1.0)

    clf = SGDClassifier(
        loss='log_loss', # LogisticRegression
        penalty=penalty,
        alpha=alpha,
        l1_ratio=l1_ratio,
        class_weight="balanced", # Crutial because we have big class imbalance in our training dataset
        random_state=RANDOM_STATE,
        n_jobs=N_JOBS
    )

    pipeline = Pipeline([
        ("vectorizer", vectorizer),
        ("classifier", clf)
    ])

    scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=3,
        scoring="f1_macro", # Because of our class imbalance
        n_jobs=N_JOBS
    )

    return scores.mean()

Note that we skipped max_iter parameter because it does not impact partial_fit behaviour according to https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html.

In [28]:
study = optuna.create_study(direction="maximize")
study.optimize(sgdclassifier_objective, n_trials=20, show_progress_bar=True)

[I 2026-01-04 15:26:07,774] A new study created in memory with name: no-name-0fb626e2-c2ea-4449-ac44-383fc1c84515


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-04 15:30:17,053] Trial 0 finished with value: 0.9854867955423026 and parameters: {'preprocessor': 'no_punct', 'n_features_pow': 14, 'ngram_max': 4, 'alpha': 8.298039111679069e-06, 'penalty': 'elasticnet', 'max_iter': 1697, 'l1_ratio': 0.1944574548372231}. Best is trial 0 with value: 0.9854867955423026.
[I 2026-01-04 15:34:40,770] Trial 1 finished with value: 0.9913250314610041 and parameters: {'preprocessor': 'simple', 'n_features_pow': 19, 'ngram_max': 4, 'alpha': 2.325802351912329e-06, 'penalty': 'elasticnet', 'max_iter': 1997, 'l1_ratio': 0.03514745774727046}. Best is trial 1 with value: 0.9913250314610041.
[I 2026-01-04 15:38:41,243] Trial 2 finished with value: 0.9773485365091737 and parameters: {'preprocessor': 'simple', 'n_features_pow': 12, 'ngram_max': 4, 'alpha': 2.325104182041563e-05, 'penalty': 'l2', 'max_iter': 687, 'l1_ratio': 0.8228819174630395}. Best is trial 1 with value: 0.9913250314610041.
[I 2026-01-04 15:43:41,091] Trial 3 finished with value: 0.96360402

In [29]:
study.best_params

{'preprocessor': 'simple',
 'n_features_pow': 20,
 'ngram_max': 5,
 'alpha': 1.0037269482365805e-06,
 'penalty': 'l2',
 'max_iter': 851,
 'l1_ratio': 0.3450976726144354}

### Saving model

Now we're gonna save our model using joblib in order to not go through hyperparameter tuning stage again in the future.

As our preprocessor parameter ended up being simple, we can skip the preprocessor parameter in our HashingVectorizer in final pipeline because it just returns a text itself.

In [34]:
sgd_model_path = "../../models/dev/sgd_language_detection_model.joblib"

In [32]:
params = study.best_params

final_sgd_vectorizer = HashingVectorizer(
    analyzer='char_wb',
    ngram_range=(1, params["ngram_max"]),
    n_features=2 ** params["n_features_pow"],
    alternate_sign=True,
    norm='l2',
    lowercase=True
)

final_sgd_clf = SGDClassifier(
    loss='log_loss',
    penalty=params["penalty"],
    alpha=params["alpha"],
    l1_ratio=params["l1_ratio"],
    class_weight="balanced",
    random_state=RANDOM_STATE,
    n_jobs=N_JOBS
)

final_sgd_pipeline = Pipeline([
    ("vectorizer", final_sgd_vectorizer),
    ("classifier", final_sgd_clf)
])

In [None]:
final_sgd_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('vectorizer', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,loss,'log_loss'
,penalty,'l2'
,alpha,1.0037269482365805e-06
,l1_ratio,0.3450976726144354
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [35]:
joblib.dump(final_sgd_pipeline, sgd_model_path)

['../../models/dev/sgd_language_detection_model.joblib']