# Initialization

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import optuna
import joblib

from typing import List
import string

In [2]:
RANDOM_STATE = 0
N_JOBS = 6

# Reading data

In [3]:
train_df = pd.read_csv("data/processed/train_english_spam_detection_dataset.csv")
train_df.head()

Unnamed: 0,message,spam
0,Had your contract mobile 11 Mnths? Latest Moto...,1
1,the lottery promotion company limited churchil...,1
2,would you re - flnance if you knew you ' d sav...,1
3,"please see attached , clean and redline versio...",0
4,ilug wilson kamela attn sir madan strictly con...,1


In [4]:
X_train = train_df["message"]
y_train = train_df["spam"]

# Encoders

In this section we'll compare the learning speed of several different encoders for our texts as well as prepare different preprocessing approaches.

We should also note that all of our encoders are gonna tokenize text by characters and not by words for the reason that is we're trying to classify text and not to find some deep semantic meaning of it, for which separation by characters is a much better approach (it can handle typos and only "looks" for the character distribution by which language can be determined pretty accurately, but we still may be caring for characters order and for this reason we're gonna use custom n-grams parameter values).

## Preprocessors

In this section we're gonna write some simple different preprocessors.

In [6]:
def simple_preprocessor(text: str) -> str:
    return text

In [7]:
def no_punctuation_preprocessor(text: str) -> str:
    blacklist_chars = string.punctuation
    translation_table = str.maketrans("", "", blacklist_chars)
    cleaned_text = text.translate(translation_table)
    return cleaned_text

In [8]:
preprocessors = {
    "simple": simple_preprocessor,
    "no_punct": no_punctuation_preprocessor
}

We're also gonna copy our preprocessor functions as a custom model to the file located at `../../app/models/transformers.py` in order to be able to use them from anywhere later without need to redefine them (and we also may need this for our final model to work properly).

## TF-IDF

scikin-learn's implementation of TF-IDF is offline, which means that it keeps everything in-memory and should be trained in one run. It may not be viable for training and tuning our models, but we still gonna check the perfomance of this approach as well.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
tfidf = TfidfVectorizer(
    analyzer='char_wb', # We're caring only about characters in word bounds
    ngram_range=(1,3),
    max_features=100000,
    preprocessor=no_punctuation_preprocessor,
    lowercase=True,
)

Note that we're not using stopwords as they can be a crutial part in differentiating between languages from one language family (exactly Russian and Ukrainian).

In [11]:
tfidf.fit(X_train)

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,<function no_...0013992D42980>
,tokenizer,
,analyzer,'char_wb'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


As we can see, this TF-IDF implementation has been training for a pretty long time considering that we'll be tuning hyperparameters of both encoder and our classification model, so we're not likely to use it considering that it is also an offline implementation.

## HashingVectorizer

HashingVectorizer is an online approach, which means that we can partially train it. Also, its scikit-learn implementation should be noticably faster than the implementation of TF-IDF (although it's resuls can be quite worse, but it's still faster to train which means that it's also faster to tune).

In [12]:
from sklearn.feature_extraction.text import HashingVectorizer

In [13]:
vectorizer = HashingVectorizer(
    analyzer='char_wb',
    ngram_range=(1,3),
    n_features=2**20,
    lowercase=True,
    preprocessor=no_punctuation_preprocessor
)

In [14]:
vectorizer.fit_transform(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 21098433 stored elements and shape (44963, 1048576)>

As this vectorizer is online, it's fit is stateless, but it's whole expensive computation is performed during transform stage. In result we can see, that it's still faster than TF-IDF, so we'll use it during our hyperparameters tuning.

# Models

We'll try two different models - SGDClassifier and Multinomial NaiveBayes, both of which are pretty popular in text classification tasks.

We're not gonna try other popular models, such as XGBoost and LightGBM, in this specific case due to the fact that they are gonna consume a lot more memory (proportional to `amount of rows` * `amount of features`) and be much slower (time of execution is proportional to `amount of rows` * `amount of features` * `amount of trees (weak learners)`) than our two models of choice.

In [5]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import cross_val_score

In [6]:
from utils.models.transformers import TextCleaner
from utils.models.language_detector import LanguageDetector

## SGDClassifier

In [7]:
from sklearn.linear_model import SGDClassifier

In [8]:
sgd_model_path = "models/dev/sgd_english_spam_detection_model.joblib"

### Tuning

In [None]:
def sgdclassifier_objective(trial: optuna.Trial):
    # Word ngrams
    word_n_features_pow = trial.suggest_int("word_n_features_pow", 16, 21)
    word_ngram_max = trial.suggest_int("word_ngram_max", 1, 3)

    word_vectorizer = HashingVectorizer(
        analyzer='word',
        ngram_range=(1, word_ngram_max),
        n_features=2 ** word_n_features_pow,
        alternate_sign=True,
        norm='l2',
        lowercase=True
    )

    # Characters inside words ngrams
    char_n_features_pow = trial.suggest_int("char_n_features_pow", 14, 18)
    char_ngram_min = trial.suggest_int("char_ngram_min", 2, 3)
    char_ngram_max = trial.suggest_int("char_ngram_max", 4, 6)

    char_vectorizer = HashingVectorizer(
        analyzer='char_wb',
        ngram_range=(char_ngram_min, char_ngram_max),
        n_features=2 ** char_n_features_pow,
        alternate_sign=True,
        norm='l2',
        lowercase=True
    )

    # Combined features vectorizer
    combined_features_vectorizer = FeatureUnion([
        ("word_features", word_vectorizer),
        ("char_features", char_vectorizer),
    ])

    # SGDClassifier hyperparameters
    alpha = trial.suggest_float("alpha", 1e-6, 1e-1, log=True)
    penalty = trial.suggest_categorical("penalty", ["l2", "elasticnet"])

    # ElasticNet mixing parameter, will be ignored by model if penalty is l2
    l1_ratio = trial.suggest_float("l1_ratio", 0.0, 1.0)

    clf = SGDClassifier(
        loss='log_loss', # LogisticRegression
        penalty=penalty,
        alpha=alpha,
        l1_ratio=l1_ratio,
        class_weight="balanced", # Crutial because we have big class imbalance in our training dataset
        random_state=RANDOM_STATE,
        n_jobs=N_JOBS
    )

    pipeline = Pipeline([
        ("features_vectorizer", combined_features_vectorizer),
        ("classifier", clf)
    ])

    language_detector = LanguageDetector(pipeline)

    scores = cross_val_score(
        language_detector,
        X_train,
        y_train,
        cv=3,
        scoring="f1_macro", # Because of our class imbalance
        n_jobs=N_JOBS
    )

    return scores.mean()

Note that we skipped max_iter parameter because it does not impact partial_fit behaviour according to https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html.

In [10]:
study = optuna.create_study(direction="maximize")
study.optimize(sgdclassifier_objective, n_trials=20, show_progress_bar=True)

[I 2026-02-16 15:51:48,638] A new study created in memory with name: no-name-935bc1fb-496e-4a99-aa56-c3bf56fd0ef1


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-02-16 15:53:19,607] Trial 0 finished with value: 0.8370967264158434 and parameters: {'word_n_features_pow': 18, 'word_ngram_max': 1, 'char_n_features_pow': 17, 'char_ngram_min': 2, 'char_ngram_max': 5, 'alpha': 0.0015344770286216357, 'penalty': 'elasticnet', 'l1_ratio': 0.36589857424180516}. Best is trial 0 with value: 0.8370967264158434.
[I 2026-02-16 15:54:54,260] Trial 1 finished with value: 0.9333884714071422 and parameters: {'word_n_features_pow': 19, 'word_ngram_max': 3, 'char_n_features_pow': 16, 'char_ngram_min': 2, 'char_ngram_max': 6, 'alpha': 0.0001869490342689748, 'penalty': 'l2', 'l1_ratio': 0.6652428939196582}. Best is trial 1 with value: 0.9333884714071422.
[I 2026-02-16 15:56:05,196] Trial 2 finished with value: 0.8393391365538792 and parameters: {'word_n_features_pow': 19, 'word_ngram_max': 1, 'char_n_features_pow': 18, 'char_ngram_min': 2, 'char_ngram_max': 5, 'alpha': 0.009229902587161394, 'penalty': 'l2', 'l1_ratio': 0.6672352003506664}. Best is trial 1 with

In [11]:
study.best_params

{'word_n_features_pow': 16,
 'word_ngram_max': 2,
 'char_n_features_pow': 16,
 'char_ngram_min': 3,
 'char_ngram_max': 4,
 'alpha': 1.2107742997998623e-06,
 'penalty': 'elasticnet',
 'l1_ratio': 0.017198734260344205}

```
{'word_n_features_pow': 16,
 'word_ngram_max': 2,
 'char_n_features_pow': 16,
 'char_ngram_min': 3,
 'char_ngram_max': 4,
 'alpha': 1.2107742997998623e-06,
 'penalty': 'elasticnet',
 'l1_ratio': 0.017198734260344205}
```

### Saving model

We'll save best parameters into a separate variable in order to not need to go through hyperparameter tuning once again to get them.

In [12]:
sgd_best_params = {
    'word_n_features_pow': 16,
    'word_ngram_max': 2,
    'char_n_features_pow': 16,
    'char_ngram_min': 3,
    'char_ngram_max': 4,
    'alpha': 1.2107742997998623e-06,
    'penalty': 'elasticnet',
    'l1_ratio': 0.017198734260344205
}

Now we're gonna save our model using joblib in order to not go through hyperparameter tuning stage again in the future.

As our text cleaner parameter ended up being `simple`, our model will lead best perfomance without additional preprocessing applied on text.

In [13]:
try:
    sgd_params = study.best_params
except:
    sgd_params = sgd_best_params 

word_vectorizer = HashingVectorizer(
    analyzer='word',
    ngram_range=(1, sgd_params["word_ngram_max"]),
    n_features=2 ** sgd_params["word_n_features_pow"],
    alternate_sign=True,
    norm='l2',
    lowercase=True
)

char_vectorizer = HashingVectorizer(
    analyzer='char_wb',
    ngram_range=(sgd_params["char_ngram_min"], sgd_params["char_ngram_max"]),
    n_features=2 ** sgd_params["char_n_features_pow"],
    alternate_sign=True,
    norm='l2',
    lowercase=True
)

combined_features_vectorizer = FeatureUnion([
    ("word_features", word_vectorizer),
    ("char_features", char_vectorizer),
])

final_sgd_clf = SGDClassifier(
    loss='log_loss',
    penalty=sgd_params["penalty"],
    alpha=sgd_params["alpha"],
    l1_ratio=sgd_params["l1_ratio"],
    class_weight="balanced",
    random_state=RANDOM_STATE,
    n_jobs=N_JOBS
)

sgd_spam_detector = Pipeline([
    ("features_vectorizer", combined_features_vectorizer),
    ("classifier", final_sgd_clf)
])

In [14]:
sgd_spam_detector.fit(X_train, y_train)

0,1,2
,steps,"[('features_vectorizer', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformer_list,"[('word_features', ...), ('char_features', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(3, ...)"

0,1,2
,loss,'log_loss'
,penalty,'elasticnet'
,alpha,1.2107742997998623e-06
,l1_ratio,0.017198734260344205
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [15]:
joblib.dump(sgd_spam_detector, sgd_model_path)

['models/dev/sgd_english_spam_detection_model.joblib']

### Loading model

And now we can load our already tuned model and see it's structure.

In [16]:
sgd_model = joblib.load(sgd_model_path)
sgd_model

0,1,2
,steps,"[('features_vectorizer', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformer_list,"[('word_features', ...), ('char_features', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(3, ...)"

0,1,2
,loss,'log_loss'
,penalty,'elasticnet'
,alpha,1.2107742997998623e-06
,l1_ratio,0.017198734260344205
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


As we've stated earlier, we can use our model as-is - no additional text preprocessing needed!

## MultinomialNB

In [17]:
from sklearn.naive_bayes import MultinomialNB

In [18]:
mnb_model_path = "models/dev/mnb_english_spam_detection_model.joblib"

### Tuning

In [19]:
def mnbclassifier_objective(trial: optuna.Trial):
    # Word ngrams
    word_n_features_pow = trial.suggest_int("word_n_features_pow", 16, 21)
    word_ngram_max = trial.suggest_int("word_ngram_max", 1, 3)

    word_vectorizer = HashingVectorizer(
        analyzer='word',
        ngram_range=(1, word_ngram_max),
        n_features=2 ** word_n_features_pow,
        alternate_sign=False, # Values must be non-negative for MultinomialNB
        norm='l2',
        lowercase=True
    )

    # Characters inside words ngrams
    char_n_features_pow = trial.suggest_int("char_n_features_pow", 14, 18)
    char_ngram_min = trial.suggest_int("char_ngram_min", 2, 3)
    char_ngram_max = trial.suggest_int("char_ngram_max", 4, 6)

    char_vectorizer = HashingVectorizer(
        analyzer='char_wb',
        ngram_range=(char_ngram_min, char_ngram_max),
        n_features=2 ** char_n_features_pow,
        alternate_sign=False, # Values must be non-negative for MultinomialNB
        norm='l2',
        lowercase=True
    )

    # Combined features vectorizer
    combined_features_vectorizer = FeatureUnion([
        ("word_features", word_vectorizer),
        ("char_features", char_vectorizer),
    ])

    # MultinomialNB hyperparameters
    alpha = trial.suggest_float("alpha", 1e-10, 10.0, log=True)
    fit_prior = trial.suggest_categorical("fit_prior", [True, False])

    clf = MultinomialNB(
        alpha=alpha,
        fit_prior=fit_prior
    )

    pipeline = Pipeline([
        ("features_vectorizer", combined_features_vectorizer),
        ("classifier", clf)
    ])

    language_detector = LanguageDetector(pipeline)

    scores = cross_val_score(
        language_detector,
        X_train,
        y_train,
        cv=3,
        scoring="f1_macro", # Because of our class imbalance
        n_jobs=N_JOBS
    )

    return scores.mean()

In [20]:
study = optuna.create_study(direction="maximize")
study.optimize(mnbclassifier_objective, n_trials=20, show_progress_bar=True)

[I 2026-02-16 16:20:11,474] A new study created in memory with name: no-name-50270047-24cd-4ae4-b217-565fde742d61


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-02-16 16:21:25,065] Trial 0 finished with value: 0.9414267657535174 and parameters: {'word_n_features_pow': 19, 'word_ngram_max': 1, 'char_n_features_pow': 15, 'char_ngram_min': 2, 'char_ngram_max': 4, 'alpha': 7.083412315401423e-10, 'fit_prior': False}. Best is trial 0 with value: 0.9414267657535174.
[I 2026-02-16 16:22:09,898] Trial 1 finished with value: 0.9441792333128145 and parameters: {'word_n_features_pow': 19, 'word_ngram_max': 1, 'char_n_features_pow': 16, 'char_ngram_min': 3, 'char_ngram_max': 5, 'alpha': 2.6562194852236175e-06, 'fit_prior': True}. Best is trial 1 with value: 0.9441792333128145.
[I 2026-02-16 16:23:11,120] Trial 2 finished with value: 0.934894117004097 and parameters: {'word_n_features_pow': 18, 'word_ngram_max': 2, 'char_n_features_pow': 16, 'char_ngram_min': 3, 'char_ngram_max': 5, 'alpha': 0.3620637030737649, 'fit_prior': False}. Best is trial 1 with value: 0.9441792333128145.
[I 2026-02-16 16:24:25,095] Trial 3 finished with value: 0.930523336364

In [21]:
study.best_params

{'word_n_features_pow': 21,
 'word_ngram_max': 3,
 'char_n_features_pow': 17,
 'char_ngram_min': 2,
 'char_ngram_max': 4,
 'alpha': 0.01038499332509461,
 'fit_prior': False}

```
{'word_n_features_pow': 21,
 'word_ngram_max': 3,
 'char_n_features_pow': 17,
 'char_ngram_min': 2,
 'char_ngram_max': 4,
 'alpha': 0.01038499332509461,
 'fit_prior': False}
 ```

### Saving model

We'll save best parameters to a separate variable in order to not need to go through hyperparameter tuning once again to get them.

In [22]:
mnb_best_params = {
    'word_n_features_pow': 21,
    'word_ngram_max': 3,
    'char_n_features_pow': 17,
    'char_ngram_min': 2,
    'char_ngram_max': 4,
    'alpha': 0.01038499332509461,
    'fit_prior': False
}

Now we're gonna save our model using joblib in order to not go through hyperparameter tuning stage again in the future.

As our text cleaner parameter ended up being `simple`, our model will lead best perfomance without additional preprocessing applied on text.

In [25]:
try:
    mnb_params = study.best_params
except:
    mnb_params = mnb_best_params 

word_vectorizer = HashingVectorizer(
    analyzer='word',
    ngram_range=(1, mnb_params["word_ngram_max"]),
    n_features=2 ** mnb_params["word_n_features_pow"],
    alternate_sign=False,
    norm='l2',
    lowercase=True
)

char_vectorizer = HashingVectorizer(
    analyzer='char_wb',
    ngram_range=(mnb_params["char_ngram_min"], mnb_params["char_ngram_max"]),
    n_features=2 ** mnb_params["char_n_features_pow"],
    alternate_sign=False,
    norm='l2',
    lowercase=True
)

combined_features_vectorizer = FeatureUnion([
    ("word_features", word_vectorizer),
    ("char_features", char_vectorizer),
])

final_mnb_clf = MultinomialNB(
    alpha=mnb_params["alpha"],
    fit_prior=mnb_params["fit_prior"]
)

mnb_spam_detector = Pipeline([
    ("features_vectorizer", combined_features_vectorizer),
    ("classifier", final_mnb_clf)
])

In [26]:
mnb_spam_detector.fit(X_train, y_train)

0,1,2
,steps,"[('features_vectorizer', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformer_list,"[('word_features', ...), ('char_features', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(2, ...)"

0,1,2
,alpha,0.01038499332509461
,force_alpha,True
,fit_prior,False
,class_prior,


In [27]:
joblib.dump(mnb_spam_detector, mnb_model_path)

['models/dev/mnb_english_spam_detection_model.joblib']

### Loading model

And now we can load our already tuned model and see it's structure.

In [28]:
mnb_model = joblib.load(mnb_model_path)
mnb_model

0,1,2
,steps,"[('features_vectorizer', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformer_list,"[('word_features', ...), ('char_features', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(2, ...)"

0,1,2
,alpha,0.01038499332509461
,force_alpha,True
,fit_prior,False
,class_prior,


As we've stated earlier, we can use our model as-is - no additional text preprocessing needed!