In [38]:
!pip3 install spacy -q
!pip3 install optuna -q
!pip3 install dvclive -q

In [55]:
import pandas as pd
import spacy
import optuna
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, precision_score
from dvclive import Live

import warnings
warnings.filterwarnings("ignore")

In [40]:
DATAPATH = 'data/'
SEED = 53

In [41]:
data = pd.read_csv(DATAPATH + 'Ethos_Dataset_Binary.csv', delimiter=';')
data.head(3)

Unnamed: 0,comment,isHate
0,You should know women's sports are a joke,1.0
1,You look like Sloth with deeper Down’s syndrome,1.0
2,You look like Russian and speak like Indian. B...,1.0


## Data preprocessing

In [42]:
data['isHate'].value_counts().iloc[:5]

isHate
0.000000    354
1.000000    163
0.166667    106
0.833333    100
0.333333     80
Name: count, dtype: int64

Оставим только 0 и 1 

In [43]:
data = data[data['isHate'].isin([0, 1])]

In [44]:
train, test = train_test_split(data, stratify=data['isHate'], test_size=50, random_state=SEED)
train, val = train_test_split(train, stratify=train['isHate'], test_size=50, random_state=SEED)

In [45]:
print(train.shape[0], val.shape[0], test.shape[0])

417 50 50


In [46]:
spacy.cli.download("en_core_web_sm")
spacy_en = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
stopwords = spacy_en.Defaults.stop_words

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 25.0 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [47]:
def prep_text(df, text_field, spc, sw=stopwords):
    df[text_field] = df[text_field].apply(
        lambda text: " ".join(
            token.lemma_ for token in spc(text)
            if (not token.is_punct) and (token not in sw)
        )
    )
    return df

In [48]:
train = prep_text(train, "comment", spacy_en)
val = prep_text(val, "comment", spacy_en)
test = prep_text(test, "comment", spacy_en)

In [49]:
train.head(3)

Unnamed: 0,comment,isHate
790,not only the politician be go into the wood ch...,0.0
945,delightful bring back some memory have a...,0.0
67,or when the man walk up to the little girl wit...,1.0


## Results with default parameters

In [50]:
pipe = Pipeline([('vectorizer', TfidfVectorizer()), ('model', LogisticRegression(random_state=SEED))])
pipe.fit(train['comment'], train['isHate'])
val_preds = pipe.predict(val['comment'])
test_preds = pipe.predict(test['comment'])

In [51]:
vect_params = {key: pipe['vectorizer'].get_params().get(key) for key in ['analyzer', 'ngram_range']}
logreg_params = {key: pipe['model'].get_params().get(key) for key in ['C', 'max_iter', 'solver']}
with Live(save_dvc_exp=True) as live:
    for param_name, param_value in vect_params.items():
        live.log_param(param_name, param_value)
    for param_name, param_value in logreg_params.items():
        live.log_param(param_name, param_value)
    live.log_metric("recall", recall_score(test['isHate'], test_preds))
    live.log_metric("precision", precision_score(test['isHate'], test_preds))
    live.log_metric("f1_score", f1_score(test['isHate'], test_preds))

	requirements.txt, .gitignore, .dvcignore, HW1_experiment_management/experiment.ipynb, HW1_experiment_management/data/Ethos_Dataset_Binary.csv.dvc, HW1_experiment_management/data/.gitignore, .dvc/config, .dvc/.gitignore


In [52]:
f1_score(test['isHate'], test_preds)

0.2222222222222222

## Hyperparameters search with optuna

In [53]:
def callback(study, trial):
    if study.best_trial.number == trial.number:
        study.set_user_attr(key="best_model", value=trial.user_attrs["best_model"])
        
def objective(trial):
    vect_params = {
        "analyzer": trial.suggest_categorical("analyzer", ["word", "char", "char_wb"]),
        "ngram_range": trial.suggest_categorical(
            "ngram_range", [(1, 1), (1, 2), (2, 2), (2, 3), (2, 4), (2, 5)]
        )
    }
    logreg_params = {
        "C": trial.suggest_float("C", 0.01, 10),
        "max_iter": trial.suggest_int("max_iter", 100, 500),
        "solver": trial.suggest_categorical(
            "solver", ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"]
        ),
        "random_state": SEED
    }
    pipe = Pipeline([
        ('vectorizer', TfidfVectorizer(**vect_params)), 
        ('model', LogisticRegression(**logreg_params))
    ])

    pipe.fit(train['comment'], train['isHate'])
    trial.set_user_attr(key="best_model", value=pipe)
    val_preds = pipe.predict(val['comment'])
    metric = f1_score(val['isHate'], val_preds)
    return metric

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, callbacks=[callback])

best_model=study.user_attrs["best_model"]
study.best_params

[I 2023-06-06 21:09:08,813] A new study created in memory with name: no-name-1a7c4e79-525e-4888-91f3-2b9a9b01735b
[I 2023-06-06 21:09:09,872] Trial 0 finished with value: 0.6896551724137931 and parameters: {'analyzer': 'char_wb', 'ngram_range': (2, 2), 'C': 6.874761416523821, 'max_iter': 193, 'solver': 'newton-cholesky'}. Best is trial 0 with value: 0.6896551724137931.
[I 2023-06-06 21:09:10,016] Trial 1 finished with value: 0.64 and parameters: {'analyzer': 'char', 'ngram_range': (2, 3), 'C': 2.574007375051231, 'max_iter': 374, 'solver': 'liblinear'}. Best is trial 0 with value: 0.6896551724137931.
[I 2023-06-06 21:09:10,108] Trial 2 finished with value: 0.6923076923076923 and parameters: {'analyzer': 'char_wb', 'ngram_range': (2, 3), 'C': 2.1066233832363688, 'max_iter': 379, 'solver': 'liblinear'}. Best is trial 2 with value: 0.6923076923076923.
[I 2023-06-06 21:09:10,215] Trial 3 finished with value: 0.7333333333333334 and parameters: {'analyzer': 'char', 'ngram_range': (1, 2), 'C':

{'analyzer': 'char',
 'ngram_range': (2, 3),
 'C': 8.515864014557286,
 'max_iter': 393,
 'solver': 'lbfgs'}

In [54]:
test_preds = best_model.predict(test['comment'])

In [58]:
joblib.dump(best_model, 'best_model.pkl')

with Live(save_dvc_exp=True) as live:
    for param_name, param_value in study.best_params.items():
        live.log_param(param_name, param_value)
    live.log_metric("recall", recall_score(test['isHate'], test_preds))
    live.log_metric("precision", precision_score(test['isHate'], test_preds))
    live.log_metric("f1_score", f1_score(test['isHate'], test_preds))
    live.log_artifact("best_model.pkl", type="model", name='baseline', desc="tfidf+logreg")

	requirements.txt, .gitignore, .dvcignore, HW1_experiment_management/experiment.ipynb, HW1_experiment_management/data/Ethos_Dataset_Binary.csv.dvc, HW1_experiment_management/data/.gitignore, .dvc/config, .dvc/.gitignore


In [57]:
f1_score(test['isHate'], test_preds)

0.4800000000000001