In [1]:
from argparse import Namespace
from pathlib import Path
import joblib
import tempfile

In [2]:
import mlflow

In [3]:
# Specify arguments
args = Namespace(
    lower=True,
    stem=False,
    analyzer="char",
    ngram_max_range=7,
    alpha=1e-4,
    learning_rate=1e-1,
    power_t=0.1,
    num_epochs=100
)

In [4]:
# Set tracking URI
MODEL_REGISTRY = Path("experiments")
Path(MODEL_REGISTRY).mkdir(exist_ok=True) # create experiments dir
mlflow.set_tracking_uri("file://" + str(MODEL_REGISTRY.absolute()))

In [5]:
!mlflow --version

mlflow, version 2.1.1


In [6]:
import random
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import numpy as np
import json
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support,log_loss
from sklearn.model_selection import train_test_split
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from sklearn import metrics
from sklearn.linear_model import SGDClassifier

In [7]:
# nltk.download("stopwords")
STOPWORDS = stopwords.words("english")
stemmer = PorterStemmer()

def preprocess(df, lower, stem, min_freq):
    """Preprocess the data."""
    df["text"] = df.title + " " + df.description  # feature engineering
    df.text = df.text.apply(clean_text, lower=lower, stem=stem)  # clean text
    tags = Counter(df.tag.values)

    # Replace OOS tags with `other`
    oos_tags = [item for item in df.tag.unique() if item not in ACCEPTED_TAGS]
    df.tag = df.tag.apply(lambda x: "other" if x in oos_tags else x)

    # Replace tags below min_freq with `other`
    tags_above_freq = Counter(tag for tag in tags.elements()
                            if (tags[tag] >= min_freq))
    df.tag = df.tag.apply(lambda tag: tag if tag in tags_above_freq else None)
    df.tag = df.tag.fillna("other")

    return df

# Accepted tags (external constraint)
ACCEPTED_TAGS = ["natural-language-processing", "computer-vision", "mlops", "graph-learning"]

# Minimum frequency required for a tag
min_freq = 75

def clean_text(text, lower=True, stem=False, stopwords=STOPWORDS):
    """Clean raw text."""
    # Lower
    if lower:
        text = text.lower()

    # Remove stopwords
    if len(stopwords):
        pattern = re.compile(r'\b(' + r"|".join(stopwords) + r")\b\s*")
        text = pattern.sub('', text)

    # Spacing and filters
    text = re.sub(
        r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text
    )  # add spacing between objects to be filtered
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()  # strip white space at the ends

    # Remove links
    text = re.sub(r"http\S+", "", text)

    # Stemming
    if stem:
        text = " ".join([stemmer.stem(word, to_lowercase=lower) for word in text.split(" ")])

    return text

class LabelEncoder(object):
    """Encode labels into unique indices"""
    def __init__(self, class_to_index={}):
        self.class_to_index = class_to_index or {}  # mutable defaults ;)
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y):
        classes = np.unique(y)
        for i, class_ in enumerate(classes):
            self.class_to_index[class_] = i
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())
        return self

    def encode(self, y):
        encoded = np.zeros((len(y)), dtype=int)
        for i, item in enumerate(y):
            encoded[i] = self.class_to_index[item]
        return encoded

    def decode(self, y):
        classes = []
        for i, item in enumerate(y):
            classes.append(self.index_to_class[item])
        return classes

    def save(self, fp):
        with open(fp, "w") as fp:
            contents = {"class_to_index": self.class_to_index}
            json.dump(contents, fp, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        with open(fp, "r") as fp:
            kwargs = json.load(fp=fp)
        return cls(**kwargs)
    
    
def get_data_splits(X, y, train_size=0.7):
    """Generate balanced data splits."""
    X_train, X_, y_train, y_ = train_test_split(
        X, y, train_size=train_size, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(
        X_, y_, train_size=0.5, stratify=y_)
    return X_train, X_val, X_test, y_train, y_val, y_test


# Custom predict function
def custom_predict(y_prob, threshold, index):
    """Custom predict function that defaults
    to an index if conditions are not met."""
    y_pred = [np.argmax(p) if max(p) > threshold else index for p in y_prob]
    return np.array(y_pred)


In [8]:
def train(args,trial=None):
    """Train model on data."""

    # Setup
    df = pd.read_csv(r"../data/labeled_projects.csv")
    df = df.sample(frac=1).reset_index(drop=True)
    df = preprocess(df, lower=True, stem=False, min_freq=min_freq)
    label_encoder = LabelEncoder().fit(df.tag)
    X_train, X_val, X_test, y_train, y_val, y_test = \
        get_data_splits(X=df.text.to_numpy(), y=label_encoder.encode(df.tag))

    # Tf-idf
    vectorizer = TfidfVectorizer(analyzer=args.analyzer, ngram_range=(2,args.ngram_max_range))  # char n-grams
    X_train = vectorizer.fit_transform(X_train)
    X_val = vectorizer.transform(X_val)
    X_test = vectorizer.transform(X_test)

    # Oversample
    oversample = RandomOverSampler(sampling_strategy="all")
    X_over, y_over = oversample.fit_resample(X_train, y_train)

    # Model
    model = SGDClassifier(
        loss="log", penalty="l1", alpha=args.alpha, max_iter=1,
        learning_rate="constant", eta0=args.learning_rate, power_t=args.power_t,
        warm_start=True)

    # Training
    for epoch in range(args.num_epochs):
        model.fit(X_over, y_over)
        train_loss = log_loss(y_train, model.predict_proba(X_train))
        val_loss = log_loss(y_val, model.predict_proba(X_val))
        if not epoch%10:
            print(
                f"Epoch: {epoch:02d} | "
                f"train_loss: {train_loss:.5f}, "
                f"val_loss: {val_loss:.5f}"
            )

        # Log
        if not trial:
            mlflow.log_metrics({"train_loss": train_loss, "val_loss": val_loss}, step=epoch)

        # Pruning (for optimization in next section)
        if trial:
            trial.report(val_loss, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()

    # Threshold
    y_pred = model.predict(X_val)
    y_prob = model.predict_proba(X_val)
    args.threshold = np.quantile(
        [y_prob[i][j] for i, j in enumerate(y_pred)], q=0.25)  # Q1

    # Evaluation
    other_index = label_encoder.class_to_index["other"]
    y_prob = model.predict_proba(X_test)
    y_pred = custom_predict(y_prob=y_prob, threshold=args.threshold, index=other_index)
    metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
    performance = {"precision": metrics[0], "recall": metrics[1], "f1": metrics[2]}
    print (json.dumps(performance, indent=2))

    return {
        "args": args,
        "label_encoder": label_encoder,
        "vectorizer": vectorizer,
        "model": model,
        "performance": performance
    }

In [9]:
# Set experiment
mlflow.set_experiment(experiment_name="baselines")

<Experiment: artifact_location='file:///Users/alokrajgupta/test/mlops-test/notebooks/experiments/757401905687622610', creation_time=1675516274169, experiment_id='757401905687622610', last_update_time=1675516274169, lifecycle_stage='active', name='baselines', tags={}>

In [10]:
def save_dict(d, filepath):
    """Save dict to a json file."""
    with open(filepath, "w") as fp:
        json.dump(d, indent=2, sort_keys=False, fp=fp)

In [11]:
# Tracking
with mlflow.start_run(run_name="sgd-1"):

    # Train & evaluate
    artifacts = train(args=args)

    # Log key metrics
    mlflow.log_metrics({"precision": artifacts["performance"]["precision"]})
    mlflow.log_metrics({"recall": artifacts["performance"]["recall"]})
    mlflow.log_metrics({"f1": artifacts["performance"]["f1"]})

    # Log artifacts
    with tempfile.TemporaryDirectory() as dp:
        artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
        joblib.dump(artifacts["vectorizer"], Path(dp, "vectorizer.pkl"))
        joblib.dump(artifacts["model"], Path(dp, "model.pkl"))
        save_dict(artifacts["performance"], Path(dp, "performance.json"))
        mlflow.log_artifacts(dp)

    # Log parameters
    mlflow.log_params(vars(artifacts["args"]))


Epoch: 00 | train_loss: 1.23433, val_loss: 1.23894
Epoch: 10 | train_loss: 0.67624, val_loss: 0.69655
Epoch: 20 | train_loss: 0.54456, val_loss: 0.57964
Epoch: 30 | train_loss: 0.48045, val_loss: 0.52643
Epoch: 40 | train_loss: 0.44143, val_loss: 0.49518
Epoch: 50 | train_loss: 0.41417, val_loss: 0.47341
Epoch: 60 | train_loss: 0.39382, val_loss: 0.45814
Epoch: 70 | train_loss: 0.37791, val_loss: 0.44625
Epoch: 80 | train_loss: 0.36443, val_loss: 0.43689
Epoch: 90 | train_loss: 0.35420, val_loss: 0.42943
{
  "precision": 0.8796715060302017,
  "recall": 0.7638888888888888,
  "f1": 0.7937561082799719
}


In [12]:
# !mlflow server -h 0.0.0.0 -p 8000 --backend-store-uri $PWD/experiments/

In [13]:
def load_dict(filepath):
    """Load a dict from a json file."""
    with open(filepath, "r") as fp:
        d = json.load(fp)
    return d

In [14]:
# Load all runs from experiment
experiment_id = mlflow.get_experiment_by_name("baselines").experiment_id
all_runs = mlflow.search_runs(experiment_ids=experiment_id, order_by=["metrics.val_loss ASC"])
# print (all_runs)


Traceback (most recent call last):
  File "/Users/alokrajgupta/miniconda3/envs/test/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 856, in _list_run_infos
    run_info = self._get_run_info_from_dir(r_dir)
  File "/Users/alokrajgupta/miniconda3/envs/test/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 663, in _get_run_info_from_dir
    meta = FileStore._read_yaml(run_dir, FileStore.META_DATA_FILE_NAME)
  File "/Users/alokrajgupta/miniconda3/envs/test/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 1082, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "/Users/alokrajgupta/miniconda3/envs/test/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 1075, in _read_helper
    result = read_yaml(root, file_name)
  File "/Users/alokrajgupta/miniconda3/envs/test/lib/python3.9/site-packages/mlflow/utils/file_utils.py", line 213, in read_yaml
    raise MissingConfigExcep

In [15]:
# # Best run
# best_run_id = all_runs.iloc[0].run_id
# best_run = mlflow.get_run(run_id=best_run_id)
# client = mlflow.tracking.MlflowClient()
# with tempfile.TemporaryDirectory() as dp:
#     client.download_artifacts(run_id=best_run_id, path="", dst_path=dp)
#     vectorizer = joblib.load(Path(dp, "vectorizer.pkl"))
#     label_encoder = LabelEncoder.load(fp=Path(dp, "label_encoder.json"))
#     model = joblib.load(Path(dp, "model.pkl"))
#     performance = load_dict(filepath=Path(dp, "performance.json"))


## Hyperparameter tuning

In [16]:
import optuna

In [17]:
def objective(args, trial):
    """Objective function for optimization trials."""
    # Parameters to tune
    args.analyzer = trial.suggest_categorical("analyzer", ["word", "char", "char_wb"])
    args.ngram_max_range = trial.suggest_int("ngram_max_range", 3, 10)
    args.learning_rate = trial.suggest_loguniform("learning_rate", 1e-2, 1e0)
    args.power_t = trial.suggest_uniform("power_t", 0.1, 0.5)

    # Train & evaluate
    artifacts = train(args=args,trial=trial)

    # Set additional attributes
    performance = artifacts["performance"]
    print(json.dumps(performance, indent=2))
    trial.set_user_attr("precision", performance["precision"])
    trial.set_user_attr("recall", performance["recall"])
    trial.set_user_attr("f1", performance["f1"])

    return performance["f1"]

In [18]:
from numpyencoder import NumpyEncoder
from optuna.integration.mlflow import MLflowCallback

In [19]:
NUM_TRIALS = 20  # small sample for now

In [20]:
# Optimize
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
study = optuna.create_study(study_name="optimization", direction="maximize", pruner=pruner)
mlflow_callback = MLflowCallback(
    tracking_uri=mlflow.get_tracking_uri(), metric_name="f1")
study.optimize(lambda trial: objective(args, trial),
            n_trials=NUM_TRIALS,
            callbacks=[mlflow_callback])

[32m[I 2023-02-05 12:11:51,909][0m A new study created in memory with name: optimization[0m


Epoch: 00 | train_loss: 1.18136, val_loss: 1.19496
Epoch: 10 | train_loss: 0.55399, val_loss: 0.60416
Epoch: 20 | train_loss: 0.42040, val_loss: 0.48856
Epoch: 30 | train_loss: 0.35703, val_loss: 0.43629
Epoch: 40 | train_loss: 0.31867, val_loss: 0.40555
Epoch: 50 | train_loss: 0.29292, val_loss: 0.38534
Epoch: 60 | train_loss: 0.27408, val_loss: 0.37100
Epoch: 70 | train_loss: 0.25940, val_loss: 0.35966
Epoch: 80 | train_loss: 0.24811, val_loss: 0.35123


[32m[I 2023-02-05 12:11:52,830][0m Trial 0 finished with value: 0.7512827780684924 and parameters: {'analyzer': 'char', 'ngram_max_range': 4, 'learning_rate': 0.08716458913990074, 'power_t': 0.2247648043438678}. Best is trial 0 with value: 0.7512827780684924.[0m


Epoch: 90 | train_loss: 0.23876, val_loss: 0.34423
{
  "precision": 0.8642625580125579,
  "recall": 0.7222222222222222,
  "f1": 0.7512827780684924
}
{
  "precision": 0.8642625580125579,
  "recall": 0.7222222222222222,
  "f1": 0.7512827780684924
}
Epoch: 00 | train_loss: 1.25090, val_loss: 1.27463
Epoch: 10 | train_loss: 0.65134, val_loss: 0.90028
Epoch: 20 | train_loss: 0.48991, val_loss: 0.85803
Epoch: 30 | train_loss: 0.41595, val_loss: 0.85901
Epoch: 40 | train_loss: 0.37551, val_loss: 0.86910
Epoch: 50 | train_loss: 0.34708, val_loss: 0.88039
Epoch: 60 | train_loss: 0.33065, val_loss: 0.89184
Epoch: 70 | train_loss: 0.31834, val_loss: 0.90110
Epoch: 80 | train_loss: 0.30791, val_loss: 0.90941
Epoch: 90 | train_loss: 0.30255, val_loss: 0.91752


[32m[I 2023-02-05 12:11:53,173][0m Trial 1 finished with value: 0.6602132435465768 and parameters: {'analyzer': 'word', 'ngram_max_range': 5, 'learning_rate': 0.26745695217967985, 'power_t': 0.10355489836434235}. Best is trial 0 with value: 0.7512827780684924.[0m


{
  "precision": 0.8001976986982985,
  "recall": 0.625,
  "f1": 0.6602132435465768
}
{
  "precision": 0.8001976986982985,
  "recall": 0.625,
  "f1": 0.6602132435465768
}
Epoch: 00 | train_loss: 1.37877, val_loss: 1.37956
Epoch: 10 | train_loss: 1.30068, val_loss: 1.30676
Epoch: 20 | train_loss: 1.22400, val_loss: 1.23796
Epoch: 30 | train_loss: 1.15427, val_loss: 1.17872


[32m[I 2023-02-05 12:11:53,561][0m Trial 2 finished with value: 0.6944460688758933 and parameters: {'analyzer': 'word', 'ngram_max_range': 6, 'learning_rate': 0.017974075165303863, 'power_t': 0.24924802934267862}. Best is trial 0 with value: 0.7512827780684924.[0m


Epoch: 40 | train_loss: 1.09150, val_loss: 1.12842
Epoch: 50 | train_loss: 1.03586, val_loss: 1.08595
Epoch: 60 | train_loss: 0.98640, val_loss: 1.05027
Epoch: 70 | train_loss: 0.94232, val_loss: 1.01981
Epoch: 80 | train_loss: 0.90301, val_loss: 0.99370
Epoch: 90 | train_loss: 0.86741, val_loss: 0.97141
{
  "precision": 0.7901235265365699,
  "recall": 0.6666666666666666,
  "f1": 0.6944460688758933
}
{
  "precision": 0.7901235265365699,
  "recall": 0.6666666666666666,
  "f1": 0.6944460688758933
}
Epoch: 00 | train_loss: 1.29661, val_loss: 1.30367
Epoch: 10 | train_loss: 0.76971, val_loss: 0.90992
Epoch: 20 | train_loss: 0.58303, val_loss: 0.80982
Epoch: 30 | train_loss: 0.48791, val_loss: 0.77260
Epoch: 40 | train_loss: 0.43218, val_loss: 0.75668
Epoch: 50 | train_loss: 0.39587, val_loss: 0.74994
Epoch: 60 | train_loss: 0.37159, val_loss: 0.74846
Epoch: 70 | train_loss: 0.35204, val_loss: 0.74877


[32m[I 2023-02-05 12:11:53,920][0m Trial 3 finished with value: 0.7254660910574889 and parameters: {'analyzer': 'word', 'ngram_max_range': 5, 'learning_rate': 0.1804275658227044, 'power_t': 0.2983442872419482}. Best is trial 0 with value: 0.7512827780684924.[0m


Epoch: 80 | train_loss: 0.33701, val_loss: 0.74895
Epoch: 90 | train_loss: 0.32620, val_loss: 0.75072
{
  "precision": 0.8197653295214271,
  "recall": 0.6944444444444444,
  "f1": 0.7254660910574889
}
{
  "precision": 0.8197653295214271,
  "recall": 0.6944444444444444,
  "f1": 0.7254660910574889
}
Epoch: 00 | train_loss: 0.54102, val_loss: 0.67715
Epoch: 10 | train_loss: 0.18473, val_loss: 0.43338
Epoch: 20 | train_loss: 0.15029, val_loss: 0.41687
Epoch: 30 | train_loss: 0.13873, val_loss: 0.41190
Epoch: 40 | train_loss: 0.12639, val_loss: 0.40846
Epoch: 50 | train_loss: 0.12256, val_loss: 0.41011
Epoch: 60 | train_loss: 0.11865, val_loss: 0.41114
Epoch: 70 | train_loss: 0.11613, val_loss: 0.41331
Epoch: 80 | train_loss: 0.11179, val_loss: 0.41344


[32m[I 2023-02-05 12:11:54,551][0m Trial 4 finished with value: 0.7776440395339018 and parameters: {'analyzer': 'char', 'ngram_max_range': 3, 'learning_rate': 0.8187011957013693, 'power_t': 0.46221054324488897}. Best is trial 4 with value: 0.7776440395339018.[0m


Epoch: 90 | train_loss: 0.11180, val_loss: 0.41381
{
  "precision": 0.8914503834642724,
  "recall": 0.7430555555555556,
  "f1": 0.7776440395339018
}
{
  "precision": 0.8914503834642724,
  "recall": 0.7430555555555556,
  "f1": 0.7776440395339018
}
Epoch: 00 | train_loss: 1.20769, val_loss: 1.23733
Epoch: 10 | train_loss: 0.62988, val_loss: 0.74900
Epoch: 20 | train_loss: 0.50105, val_loss: 0.64661
Epoch: 30 | train_loss: 0.43655, val_loss: 0.59668
Epoch: 40 | train_loss: 0.39773, val_loss: 0.56614
Epoch: 50 | train_loss: 0.37048, val_loss: 0.54480
Epoch: 60 | train_loss: 0.34980, val_loss: 0.52946
Epoch: 70 | train_loss: 0.33405, val_loss: 0.51744
Epoch: 80 | train_loss: 0.32165, val_loss: 0.50779
Epoch: 90 | train_loss: 0.31110, val_loss: 0.49994


[32m[I 2023-02-05 12:11:55,852][0m Trial 5 finished with value: 0.8138400321335625 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 10, 'learning_rate': 0.08039010509791486, 'power_t': 0.3116341846249877}. Best is trial 5 with value: 0.8138400321335625.[0m


{
  "precision": 0.8935619374974078,
  "recall": 0.7916666666666666,
  "f1": 0.8138400321335625
}
{
  "precision": 0.8935619374974078,
  "recall": 0.7916666666666666,
  "f1": 0.8138400321335625
}
Epoch: 00 | train_loss: 1.30540, val_loss: 1.32065
Epoch: 10 | train_loss: 0.82423, val_loss: 0.97458
Epoch: 20 | train_loss: 0.65026, val_loss: 0.88670
Epoch: 30 | train_loss: 0.56268, val_loss: 0.85673
Epoch: 40 | train_loss: 0.51122, val_loss: 0.84562
Epoch: 50 | train_loss: 0.47744, val_loss: 0.84286
Epoch: 60 | train_loss: 0.45344, val_loss: 0.84284
Epoch: 70 | train_loss: 0.43658, val_loss: 0.84391
Epoch: 80 | train_loss: 0.42458, val_loss: 0.84677


[32m[I 2023-02-05 12:11:56,300][0m Trial 6 finished with value: 0.7450401297172765 and parameters: {'analyzer': 'word', 'ngram_max_range': 8, 'learning_rate': 0.1946134411675939, 'power_t': 0.4139953839336108}. Best is trial 5 with value: 0.8138400321335625.[0m
[32m[I 2023-02-05 12:11:56,429][0m Trial 7 pruned. [0m


Epoch: 90 | train_loss: 0.41508, val_loss: 0.84953
{
  "precision": 0.8602175602175602,
  "recall": 0.7152777777777778,
  "f1": 0.7450401297172765
}
{
  "precision": 0.8602175602175602,
  "recall": 0.7152777777777778,
  "f1": 0.7450401297172765
}
Epoch: 00 | train_loss: 0.81777, val_loss: 0.93179
Epoch: 00 | train_loss: 1.10834, val_loss: 1.18253
Epoch: 10 | train_loss: 0.53471, val_loss: 0.86667
Epoch: 20 | train_loss: 0.45512, val_loss: 0.88009
Epoch: 30 | train_loss: 0.42513, val_loss: 0.89998
Epoch: 40 | train_loss: 0.40300, val_loss: 0.91915
Epoch: 50 | train_loss: 0.39306, val_loss: 0.93750


[32m[I 2023-02-05 12:11:56,927][0m Trial 8 finished with value: 0.649214701987682 and parameters: {'analyzer': 'word', 'ngram_max_range': 10, 'learning_rate': 0.7039102223514723, 'power_t': 0.21556833797404892}. Best is trial 5 with value: 0.8138400321335625.[0m


Epoch: 60 | train_loss: 0.37735, val_loss: 0.95397
Epoch: 70 | train_loss: 0.36391, val_loss: 0.96569
Epoch: 80 | train_loss: 0.35561, val_loss: 0.97704
Epoch: 90 | train_loss: 0.34960, val_loss: 0.98709
{
  "precision": 0.7829431484694642,
  "recall": 0.625,
  "f1": 0.649214701987682
}
{
  "precision": 0.7829431484694642,
  "recall": 0.625,
  "f1": 0.649214701987682
}
Epoch: 00 | train_loss: 1.27672, val_loss: 1.29041
Epoch: 10 | train_loss: 0.69973, val_loss: 0.92666
Epoch: 20 | train_loss: 0.51671, val_loss: 0.87071
Epoch: 30 | train_loss: 0.42898, val_loss: 0.86694
Epoch: 40 | train_loss: 0.37698, val_loss: 0.87683
Epoch: 50 | train_loss: 0.34354, val_loss: 0.88902
Epoch: 60 | train_loss: 0.31984, val_loss: 0.90169
Epoch: 70 | train_loss: 0.30273, val_loss: 0.91437
Epoch: 80 | train_loss: 0.29038, val_loss: 0.92584


[32m[I 2023-02-05 12:11:57,245][0m Trial 9 finished with value: 0.6858520623439249 and parameters: {'analyzer': 'word', 'ngram_max_range': 4, 'learning_rate': 0.1942570109068206, 'power_t': 0.2331847233532298}. Best is trial 5 with value: 0.8138400321335625.[0m


Epoch: 90 | train_loss: 0.28079, val_loss: 0.93576
{
  "precision": 0.8099262397991213,
  "recall": 0.6458333333333334,
  "f1": 0.6858520623439249
}
{
  "precision": 0.8099262397991213,
  "recall": 0.6458333333333334,
  "f1": 0.6858520623439249
}
Epoch: 00 | train_loss: 1.31455, val_loss: 1.33141
Epoch: 10 | train_loss: 0.86162, val_loss: 0.95270
Epoch: 20 | train_loss: 0.69372, val_loss: 0.80656
Epoch: 30 | train_loss: 0.60472, val_loss: 0.73081
Epoch: 40 | train_loss: 0.54779, val_loss: 0.68355
Epoch: 50 | train_loss: 0.50689, val_loss: 0.65037
Epoch: 60 | train_loss: 0.47601, val_loss: 0.62594
Epoch: 70 | train_loss: 0.45162, val_loss: 0.60701
Epoch: 80 | train_loss: 0.43175, val_loss: 0.59204
Epoch: 90 | train_loss: 0.41528, val_loss: 0.57971


[32m[I 2023-02-05 12:11:58,571][0m Trial 10 finished with value: 0.8017229320012969 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 10, 'learning_rate': 0.03292221034891384, 'power_t': 0.3510625165479022}. Best is trial 5 with value: 0.8138400321335625.[0m


{
  "precision": 0.8878264023210831,
  "recall": 0.7847222222222222,
  "f1": 0.8017229320012969
}
{
  "precision": 0.8878264023210831,
  "recall": 0.7847222222222222,
  "f1": 0.8017229320012969
}
Epoch: 00 | train_loss: 1.32194, val_loss: 1.32284
Epoch: 10 | train_loss: 0.88957, val_loss: 0.88581
Epoch: 20 | train_loss: 0.72065, val_loss: 0.72546
Epoch: 30 | train_loss: 0.62906, val_loss: 0.64302
Epoch: 40 | train_loss: 0.57002, val_loss: 0.59206
Epoch: 50 | train_loss: 0.52784, val_loss: 0.55679
Epoch: 60 | train_loss: 0.49586, val_loss: 0.53058
Epoch: 70 | train_loss: 0.47048, val_loss: 0.51033
Epoch: 80 | train_loss: 0.44977, val_loss: 0.49416
Epoch: 90 | train_loss: 0.43246, val_loss: 0.48090


[32m[I 2023-02-05 12:11:59,901][0m Trial 11 finished with value: 0.7283025535126861 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 10, 'learning_rate': 0.030959247511369754, 'power_t': 0.351536113811276}. Best is trial 5 with value: 0.8138400321335625.[0m


{
  "precision": 0.8689727645059698,
  "recall": 0.6944444444444444,
  "f1": 0.7283025535126861
}
{
  "precision": 0.8689727645059698,
  "recall": 0.6944444444444444,
  "f1": 0.7283025535126861
}
Epoch: 00 | train_loss: 1.27833, val_loss: 1.30139
Epoch: 10 | train_loss: 0.74840, val_loss: 0.85605
Epoch: 20 | train_loss: 0.59261, val_loss: 0.73320
Epoch: 30 | train_loss: 0.51350, val_loss: 0.67181
Epoch: 40 | train_loss: 0.46399, val_loss: 0.63363
Epoch: 50 | train_loss: 0.42928, val_loss: 0.60734
Epoch: 60 | train_loss: 0.40351, val_loss: 0.58759
Epoch: 70 | train_loss: 0.38330, val_loss: 0.57201


[32m[I 2023-02-05 12:12:01,142][0m Trial 12 finished with value: 0.8243159365928978 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 8, 'learning_rate': 0.04637257794439374, 'power_t': 0.32504761965797657}. Best is trial 12 with value: 0.8243159365928978.[0m


Epoch: 80 | train_loss: 0.36717, val_loss: 0.55961
Epoch: 90 | train_loss: 0.35361, val_loss: 0.54943
{
  "precision": 0.882114292745522,
  "recall": 0.8055555555555556,
  "f1": 0.8243159365928978
}
{
  "precision": 0.882114292745522,
  "recall": 0.8055555555555556,
  "f1": 0.8243159365928978
}
Epoch: 00 | train_loss: 1.20432, val_loss: 1.22460
Epoch: 10 | train_loss: 0.61745, val_loss: 0.69402
Epoch: 20 | train_loss: 0.48990, val_loss: 0.58329
Epoch: 30 | train_loss: 0.42791, val_loss: 0.53117
Epoch: 40 | train_loss: 0.38917, val_loss: 0.49967
Epoch: 50 | train_loss: 0.36301, val_loss: 0.47850
Epoch: 60 | train_loss: 0.34324, val_loss: 0.46298
Epoch: 70 | train_loss: 0.32779, val_loss: 0.45150


[32m[I 2023-02-05 12:12:02,373][0m Trial 13 finished with value: 0.814680427887975 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 8, 'learning_rate': 0.07896784245850681, 'power_t': 0.3082965681460817}. Best is trial 12 with value: 0.8243159365928978.[0m


Epoch: 80 | train_loss: 0.31532, val_loss: 0.44223
Epoch: 90 | train_loss: 0.30493, val_loss: 0.43481
{
  "precision": 0.8838348765432098,
  "recall": 0.7986111111111112,
  "f1": 0.814680427887975
}
{
  "precision": 0.8838348765432098,
  "recall": 0.7986111111111112,
  "f1": 0.814680427887975
}
Epoch: 00 | train_loss: 1.27661, val_loss: 1.29705
Epoch: 10 | train_loss: 0.73611, val_loss: 0.87317
Epoch: 20 | train_loss: 0.57997, val_loss: 0.76292
Epoch: 30 | train_loss: 0.50159, val_loss: 0.71090
Epoch: 40 | train_loss: 0.45315, val_loss: 0.67977
Epoch: 50 | train_loss: 0.41907, val_loss: 0.65808
Epoch: 60 | train_loss: 0.39359, val_loss: 0.64245
Epoch: 70 | train_loss: 0.37393, val_loss: 0.63067
Epoch: 80 | train_loss: 0.35791, val_loss: 0.62130
Epoch: 90 | train_loss: 0.34466, val_loss: 0.61380


[32m[I 2023-02-05 12:12:03,620][0m Trial 14 finished with value: 0.8143817126529893 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 8, 'learning_rate': 0.04707005673215154, 'power_t': 0.15850194539639184}. Best is trial 12 with value: 0.8243159365928978.[0m


{
  "precision": 0.8919082125603865,
  "recall": 0.7916666666666666,
  "f1": 0.8143817126529893
}
{
  "precision": 0.8919082125603865,
  "recall": 0.7916666666666666,
  "f1": 0.8143817126529893
}
Epoch: 00 | train_loss: 1.36305, val_loss: 1.36531
Epoch: 10 | train_loss: 1.13120, val_loss: 1.14838
Epoch: 20 | train_loss: 0.97304, val_loss: 0.99817
Epoch: 30 | train_loss: 0.86693, val_loss: 0.89842
Epoch: 40 | train_loss: 0.79097, val_loss: 0.82823
Epoch: 50 | train_loss: 0.73380, val_loss: 0.77634
Epoch: 60 | train_loss: 0.68879, val_loss: 0.73609
Epoch: 70 | train_loss: 0.65204, val_loss: 0.70370
Epoch: 80 | train_loss: 0.62150, val_loss: 0.67699


[32m[I 2023-02-05 12:12:04,845][0m Trial 15 finished with value: 0.701228555395222 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 8, 'learning_rate': 0.010635672490824734, 'power_t': 0.4953561697480423}. Best is trial 12 with value: 0.8243159365928978.[0m


Epoch: 90 | train_loss: 0.59561, val_loss: 0.65457
{
  "precision": 0.8227832104172434,
  "recall": 0.6666666666666666,
  "f1": 0.701228555395222
}
{
  "precision": 0.8227832104172434,
  "recall": 0.6666666666666666,
  "f1": 0.701228555395222
}
Epoch: 00 | train_loss: 1.25535, val_loss: 1.26906
Epoch: 10 | train_loss: 0.69018, val_loss: 0.75540
Epoch: 20 | train_loss: 0.53889, val_loss: 0.62950
Epoch: 30 | train_loss: 0.46408, val_loss: 0.56929
Epoch: 40 | train_loss: 0.41801, val_loss: 0.53326
Epoch: 50 | train_loss: 0.38610, val_loss: 0.50858
Epoch: 60 | train_loss: 0.36264, val_loss: 0.49049
Epoch: 70 | train_loss: 0.34462, val_loss: 0.47645
Epoch: 80 | train_loss: 0.32993, val_loss: 0.46482


[32m[I 2023-02-05 12:12:06,017][0m Trial 16 finished with value: 0.783982914581506 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 7, 'learning_rate': 0.054976652400194705, 'power_t': 0.29137085141894276}. Best is trial 12 with value: 0.8243159365928978.[0m


Epoch: 90 | train_loss: 0.31773, val_loss: 0.45491
{
  "precision": 0.8881302521008404,
  "recall": 0.7569444444444444,
  "f1": 0.783982914581506
}
{
  "precision": 0.8881302521008404,
  "recall": 0.7569444444444444,
  "f1": 0.783982914581506
}
Epoch: 00 | train_loss: 1.33018, val_loss: 1.33960
Epoch: 10 | train_loss: 0.91265, val_loss: 0.98566
Epoch: 20 | train_loss: 0.73311, val_loss: 0.83542
Epoch: 30 | train_loss: 0.63410, val_loss: 0.75360
Epoch: 40 | train_loss: 0.57002, val_loss: 0.70117
Epoch: 50 | train_loss: 0.52429, val_loss: 0.66409
Epoch: 60 | train_loss: 0.48946, val_loss: 0.63646
Epoch: 70 | train_loss: 0.46204, val_loss: 0.61515
Epoch: 80 | train_loss: 0.43950, val_loss: 0.59778


[32m[I 2023-02-05 12:12:07,198][0m Trial 17 finished with value: 0.8310313266777816 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 7, 'learning_rate': 0.024890198831893116, 'power_t': 0.3955581983319367}. Best is trial 17 with value: 0.8310313266777816.[0m


Epoch: 90 | train_loss: 0.42083, val_loss: 0.58357
{
  "precision": 0.9114918357248887,
  "recall": 0.8055555555555556,
  "f1": 0.8310313266777816
}
{
  "precision": 0.9114918357248887,
  "recall": 0.8055555555555556,
  "f1": 0.8310313266777816
}
Epoch: 00 | train_loss: 1.34556, val_loss: 1.35282
Epoch: 10 | train_loss: 1.00075, val_loss: 1.05751
Epoch: 20 | train_loss: 0.82414, val_loss: 0.90503
Epoch: 30 | train_loss: 0.72054, val_loss: 0.81766
Epoch: 40 | train_loss: 0.65164, val_loss: 0.76132
Epoch: 50 | train_loss: 0.60145, val_loss: 0.72128
Epoch: 60 | train_loss: 0.56292, val_loss: 0.69106
Epoch: 70 | train_loss: 0.53217, val_loss: 0.66749
Epoch: 80 | train_loss: 0.50695, val_loss: 0.64860


[32m[I 2023-02-05 12:12:08,362][0m Trial 18 finished with value: 0.8019109545112544 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 7, 'learning_rate': 0.01719774093680508, 'power_t': 0.3982218677990397}. Best is trial 17 with value: 0.8310313266777816.[0m


Epoch: 90 | train_loss: 0.48566, val_loss: 0.63287
{
  "precision": 0.8831790123456789,
  "recall": 0.7777777777777778,
  "f1": 0.8019109545112544
}
{
  "precision": 0.8831790123456789,
  "recall": 0.7777777777777778,
  "f1": 0.8019109545112544
}
Epoch: 00 | train_loss: 1.33441, val_loss: 1.34462
Epoch: 10 | train_loss: 0.94384, val_loss: 1.00127
Epoch: 20 | train_loss: 0.77064, val_loss: 0.84353
Epoch: 30 | train_loss: 0.67413, val_loss: 0.75653
Epoch: 40 | train_loss: 0.61040, val_loss: 0.69999
Epoch: 50 | train_loss: 0.56465, val_loss: 0.65994
Epoch: 60 | train_loss: 0.52980, val_loss: 0.62977
Epoch: 70 | train_loss: 0.50199, val_loss: 0.60598
Epoch: 80 | train_loss: 0.47911, val_loss: 0.58684
Epoch: 90 | train_loss: 0.46015, val_loss: 0.57116


[32m[I 2023-02-05 12:12:09,690][0m Trial 19 finished with value: 0.7871657042724058 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 9, 'learning_rate': 0.023908160183656522, 'power_t': 0.3610571722581264}. Best is trial 17 with value: 0.8310313266777816.[0m


{
  "precision": 0.8624508978675646,
  "recall": 0.7638888888888888,
  "f1": 0.7871657042724058
}
{
  "precision": 0.8624508978675646,
  "recall": 0.7638888888888888,
  "f1": 0.7871657042724058
}


In [21]:
# All trials
trials_df = study.trials_dataframe()
trials_df = trials_df.sort_values(["user_attrs_f1"], ascending=False)  # sort by metric
trials_df.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_analyzer,params_learning_rate,params_ngram_max_range,params_power_t,user_attrs_f1,user_attrs_precision,user_attrs_recall,state
17,17,0.831031,2023-02-05 12:12:06.026211,2023-02-05 12:12:07.198577,0 days 00:00:01.172366,char_wb,0.02489,7,0.395558,0.831031,0.911492,0.805556,COMPLETE
12,12,0.824316,2023-02-05 12:11:59.910889,2023-02-05 12:12:01.142359,0 days 00:00:01.231470,char_wb,0.046373,8,0.325048,0.824316,0.882114,0.805556,COMPLETE
13,13,0.81468,2023-02-05 12:12:01.151478,2023-02-05 12:12:02.373716,0 days 00:00:01.222238,char_wb,0.078968,8,0.308297,0.81468,0.883835,0.798611,COMPLETE
14,14,0.814382,2023-02-05 12:12:02.383222,2023-02-05 12:12:03.620282,0 days 00:00:01.237060,char_wb,0.04707,8,0.158502,0.814382,0.891908,0.791667,COMPLETE
5,5,0.81384,2023-02-05 12:11:54.560693,2023-02-05 12:11:55.852614,0 days 00:00:01.291921,char_wb,0.08039,10,0.311634,0.81384,0.893562,0.791667,COMPLETE


In [22]:
# Best trial
print (f"Best value (f1): {study.best_trial.value}")
print (f"Best hyperparameters: {json.dumps(study.best_trial.params, indent=2)}")

Best value (f1): 0.8310313266777816
Best hyperparameters: {
  "analyzer": "char_wb",
  "ngram_max_range": 7,
  "learning_rate": 0.024890198831893116,
  "power_t": 0.3955581983319367
}


In [23]:
# Save best parameter values
args = {**args.__dict__, **study.best_trial.params}
print (json.dumps(args, indent=2, cls=NumpyEncoder))

{
  "lower": true,
  "stem": false,
  "analyzer": "char_wb",
  "ngram_max_range": 7,
  "alpha": 0.0001,
  "learning_rate": 0.024890198831893116,
  "power_t": 0.3955581983319367,
  "num_epochs": 100,
  "threshold": 0.5111814509219883
}
