# Classification on text features

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

Примеры пайплайнов экстракции признаков

### Обучение

* Изменения в первом этапе, внесенные после защиты:
  * гиперпараметры logreg: C: `[.01, .1, .2, .5, .8, 1., 3, 5, 10]`
  * logreg все еще только с l2-регуляризацией, так как иначе требуется отличный от liblinear солвер, который работает на несколько порядков дольше
  * добавлена модель svm. гиперпараметры: C: `[.1, .5, 1, 3, 5]`, penalty: `['l1', 'l2']`
  * ГП для XGBoost, RandomForest: max_depth: `[3, 5, 10]`, n_estimators: `[500, 1000, 2000]`
  * CatBoost обучается на порядок дольше чем xgb/rf, поэтому обучение не проводилось на нем. GPU версия не работает с разреженными матрицами, вместо этого конвертирует их в dense формат, что не умещается в память

### Результаты

Ниже приведена таблица с результатами. Красным цветом выделен максимум в каждом столбце. Таблица отсортирована по убыванию f1 на тестовом множестве. Из нее можно сделать следующие выводы:
* Хеширование n-грамм в среднем дает лучшие результаты, потому что оно вообще не выбрасывает n-граммы, а использует все (возможно, с коллизиями)
* BPE токенизация дала очень сильный буст качества работы алгоритма - все модели с bpe строго лучше остальных (исключение - random forest)
* Ожидаемо, лучшей моделью в среднем оказался xgboost, от него не сильно отстает logreg, random forest оказался хуже
* В принципе лучшей моделью оказалась связка BPE + TfIdf + XGBoost, она дала f1 = 0.724 на тестовом множестве

Все вычисления производились на сервере с 8-ядерным процессором Ryzen 7 3700X @ 3.6GHz, и заняли ~60 часов

In [5]:
# Read the three result tables in a fixed order and stack them into one frame.
# NOTE(review): 'training_results_slr.csv' presumably holds the sparse_logreg
# runs even though the variable is called results_df_lr — confirm.
main_results, results_df_svm, results_df_lr = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_results.csv',
        'training_results_svm.csv',
        'training_results_slr.csv',
    )
)
results_df = pd.concat([main_results, results_df_svm, results_df_lr])

In [6]:
INDEX_COLS = ['model', 'tokenizer', 'postprocessor', 'vectorizer']

# Guard so the cell can be re-run: once the pipeline columns have been moved
# into the index, a second set_index on them would raise KeyError.
if list(results_df.index.names) != INDEX_COLS:
    results_df = results_df.set_index(INDEX_COLS)
results_df = results_df.sort_values('test_f1', ascending=False)


def highlight_max(s):
    """Build per-cell CSS that colours the maximum of one column red.

    Parameters
    ----------
    s : pandas.Series
        A single column, as handed over by ``Styler.apply``.

    Returns
    -------
    list of str
        ``'color: red'`` for every entry equal to ``s.max()`` and ``''`` for
        all other entries, in the same order as ``s``.
    """
    is_max = s == s.max()
    return ['color: red' if v else '' for v in is_max]


# Highlight numeric columns only: on the string-valued best_params column the
# "maximum" is a lexicographic comparison, and it fails outright if the column
# mixes strings with NaN.
metric_cols = results_df.select_dtypes(include='number').columns
results_df.style.apply(highlight_max, subset=metric_cols)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,train_f1,test_f1,train_acc,test_acc,train_precision,test_precision,train_recall,test_recall,train_roc_auc,test_roc_auc,best_params
model,tokenizer,postprocessor,vectorizer,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
xgb,bpe,none,tfidf,0.858855,0.724136,0.864351,0.73453,0.894663,0.755038,0.825803,0.695664,0.864333,0.734597,"{'max_depth': 10, 'n_estimators': 2000}"
xgb,bpe,none,bow,0.839435,0.722943,0.84703,0.735718,0.882831,0.761094,0.800105,0.688435,0.847008,0.735798,"{'max_depth': 10, 'n_estimators': 2000}"
svm,bpe,none,tfidf,0.7434,0.722357,0.75262,0.73181,0.771698,0.750466,0.717104,0.696277,0.752599,0.731886,"{'C': 0.1, 'penalty': 'l2'}"
logreg,bpe,none,tfidf,0.745898,0.721782,0.753774,0.729885,0.770157,0.745456,0.72312,0.699565,0.753759,0.729937,{'C': 1.0}
logreg,bpe,none,bow,0.745976,0.721509,0.756719,0.732527,0.780038,0.753904,0.714764,0.691783,0.7567,0.732597,{'C': 0.1}
sparse_logreg,bpe,none,tfidf,0.742876,0.719346,0.751914,0.729628,0.77068,0.748739,0.717008,0.692175,0.751902,0.729673,{'C': 1.0}
svm,bpe,none,bow,0.742654,0.718315,0.755767,0.732314,0.784285,0.759753,0.705221,0.681163,0.755738,0.732424,"{'C': 0.1, 'penalty': 'l1'}"
sparse_logreg,bpe,none,bow,0.747677,0.718295,0.758273,0.730538,0.781662,0.753461,0.716523,0.686265,0.758259,0.730591,{'C': 0.5}
xgb,bpe,none,hashing,0.875785,0.718272,0.879982,0.728129,0.90706,0.746664,0.846594,0.69196,0.879966,0.728191,"{'max_depth': 10, 'n_estimators': 2000}"
logreg,bpe,none,hashing,0.732077,0.704247,0.739787,0.711889,0.754052,0.724742,0.711347,0.684879,0.739773,0.711936,{'C': 0.8}


In [7]:
# Mean metrics per vectorizer, best test F1 first.
# numeric_only=True keeps the string-valued best_params column out of the
# average; pandas >= 2.0 raises TypeError on it instead of dropping it silently.
results_df.groupby('vectorizer').mean(numeric_only=True).sort_values('test_f1', ascending=False)

Unnamed: 0_level_0,train_f1,test_f1,train_acc,test_acc,train_precision,test_precision,train_recall,test_recall,train_roc_auc,test_roc_auc
vectorizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tfidf,0.712078,0.680509,0.73261,0.702024,0.764545,0.731671,0.669656,0.639148,0.73258,0.702131
bow,0.703026,0.67739,0.728053,0.703266,0.766409,0.739568,0.653305,0.628645,0.728018,0.703393
hashing,0.718173,0.673929,0.735708,0.692823,0.764048,0.718212,0.680134,0.637216,0.735682,0.692917
glove_emb,0.63343,0.621858,0.628156,0.616098,0.624925,0.613976,0.642268,0.630027,0.628163,0.616074


In [8]:
# Mean metrics per postprocessor, best test F1 first.
# numeric_only=True keeps the string-valued best_params column out of the
# average; pandas >= 2.0 raises TypeError on it instead of dropping it silently.
results_df.groupby('postprocessor').mean(numeric_only=True).sort_values('test_f1', ascending=False)

Unnamed: 0_level_0,train_f1,test_f1,train_acc,test_acc,train_precision,test_precision,train_recall,test_recall,train_roc_auc,test_roc_auc
postprocessor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
none,0.742533,0.698972,0.758486,0.716106,0.788203,0.742869,0.704023,0.661867,0.75846,0.716198
stem,0.697794,0.668983,0.721163,0.693258,0.755901,0.725703,0.651885,0.624201,0.721131,0.693376
lemma,0.678071,0.65337,0.69458,0.670586,0.719405,0.694154,0.645957,0.621713,0.694557,0.670669


In [9]:
# Mean metrics per tokenizer, best test F1 first.
# numeric_only=True keeps the string-valued best_params column out of the
# average; pandas >= 2.0 raises TypeError on it instead of dropping it silently.
results_df.groupby('tokenizer').mean(numeric_only=True).sort_values('test_f1', ascending=False)

Unnamed: 0_level_0,train_f1,test_f1,train_acc,test_acc,train_precision,test_precision,train_recall,test_recall,train_roc_auc,test_roc_auc
tokenizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
bpe,0.742533,0.698972,0.758486,0.716106,0.788203,0.742869,0.704023,0.661867,0.75846,0.716198
base,0.686523,0.660061,0.705973,0.680303,0.735046,0.707675,0.648498,0.622779,0.705946,0.6804


In [10]:
# Mean metrics per model family, best test F1 first.
# numeric_only=True keeps the string-valued best_params column out of the
# average; pandas >= 2.0 raises TypeError on it instead of dropping it silently.
results_df.groupby('model').mean(numeric_only=True).sort_values('test_f1', ascending=False)

Unnamed: 0_level_0,train_f1,test_f1,train_acc,test_acc,train_precision,test_precision,train_recall,test_recall,train_roc_auc,test_roc_auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
xgb,0.796378,0.697528,0.805226,0.7104,0.835513,0.732305,0.761124,0.666246,0.805205,0.710475
logreg,0.703931,0.687026,0.713679,0.696628,0.730526,0.712677,0.67961,0.663552,0.713663,0.696685
svm,0.703318,0.684987,0.714048,0.69607,0.732826,0.714783,0.676623,0.658101,0.714026,0.696151
sparse_logreg,0.702805,0.683549,0.712822,0.694734,0.730263,0.712513,0.677771,0.657297,0.71281,0.694779
rf,0.610199,0.605582,0.662859,0.657385,0.725837,0.718887,0.530648,0.527332,0.662797,0.657608


In [25]:
# Persist the combined results table; the index is written out as well
# (index=True is the to_csv default, spelled out here for clarity).
results_df.to_csv(path_or_buf='results.csv', index=True)