In [None]:
!nvidia-smi

Thu Sep  2 00:31:04 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P8    31W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from collections import Counter, OrderedDict
import ipywidgets as widgets
import itertools
import json
import pandas as pd
from urllib.request import urlopen

In [None]:
# Load projects
url = "https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/projects.json"
# Use the response as a context manager so the HTTP connection is closed
# as soon as the payload has been read.
with urlopen(url) as response:
    projects = json.loads(response.read())
print (json.dumps(projects[-305], indent=2))

{
  "id": 2106,
  "created_on": "2020-08-08 15:06:18",
  "title": "Fast NST for Videos (+ person segmentation) \ud83c\udfa5 + \u26a1\ud83d\udcbb + \ud83c\udfa8 = \u2764\ufe0f",
  "description": "Create NST videos and pick separate styles for the person in the video and for the background.",
  "tags": [
    "code",
    "tutorial",
    "video",
    "computer-vision",
    "style-transfer",
    "neural-style-transfer"
  ]
}


In [None]:
# Create dataframe
df = pd.DataFrame(projects)
print (f"{len(df)} projects")
df.head(5)

2032 projects


Unnamed: 0,id,created_on,title,description,tags
0,1,2020-02-17 06:30:41,Machine Learning Basics,A practical set of notebooks on machine learni...,"[code, tutorial, keras, pytorch, tensorflow, d..."
1,2,2020-02-17 06:41:45,Deep Learning with Electronic Health Record (E...,A comprehensive look at recent machine learnin...,"[article, tutorial, deep-learning, health, ehr]"
2,3,2020-02-20 06:07:59,Automatic Parking Management using computer vi...,Detecting empty and parked spaces in car parki...,"[code, tutorial, video, python, machine-learni..."
3,4,2020-02-20 06:21:57,Easy street parking using region proposal netw...,Get a text on your phone whenever a nearby par...,"[code, tutorial, python, pytorch, machine-lear..."
4,5,2020-02-20 06:29:18,Deep Learning based parking management system ...,Fastai provides easy to use wrappers to quickl...,"[code, tutorial, fastai, deep-learning, parkin..."


In [None]:
# Load tags (Auxiliary dataset)
# this dataset has the aliases for our tags, and has parent-child relationships to suggest relevant parent tags
url = "https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/tags.json"
# Use the response as a context manager so the HTTP connection is closed
# as soon as the payload has been read.
with urlopen(url) as response:
    tags = json.loads(response.read())
# Index every record by its "tag" field; the remaining fields become the value
tags_dict = {}
for item in tags:
    key = item.pop("tag")
    tags_dict[key] = item
print (f"{len(tags_dict)} tags")

400 tags


In [None]:
@widgets.interact(tag=list(tags_dict.keys()))
def display_tag_details(tag='question-answering'):
    print (json.dumps(tags_dict[tag], indent=2))

interactive(children=(Dropdown(description='tag', index=283, options=('3d', 'action-localization', 'action-rec…

In [None]:
!pip install snorkel



In [None]:
from snorkel.labeling import labeling_function

@labeling_function()
def contains_tensorflow(text):
    """Return "tensorflow" when the text mentions the framework, else None.

    The check is a case-insensitive substring test for "tensorflow" or "tf",
    so any word containing those letters also counts as a mention.
    """
    lowered = text.lower()
    for keyword in ("tensorflow", "tf"):
        if keyword in lowered:
            return "tensorflow"
    return None

In [None]:
# Input
df['text'] = df.title + " " + df.description

In [None]:

df.head()

Unnamed: 0,id,created_on,title,description,tags,text
0,1,2020-02-17 06:30:41,Machine Learning Basics,A practical set of notebooks on machine learni...,"[code, tutorial, keras, pytorch, tensorflow, d...",Machine Learning Basics A practical set of not...
1,2,2020-02-17 06:41:45,Deep Learning with Electronic Health Record (E...,A comprehensive look at recent machine learnin...,"[article, tutorial, deep-learning, health, ehr]",Deep Learning with Electronic Health Record (E...
2,3,2020-02-20 06:07:59,Automatic Parking Management using computer vi...,Detecting empty and parked spaces in car parki...,"[code, tutorial, video, python, machine-learni...",Automatic Parking Management using computer vi...
3,4,2020-02-20 06:21:57,Easy street parking using region proposal netw...,Get a text on your phone whenever a nearby par...,"[code, tutorial, python, pytorch, machine-lear...",Easy street parking using region proposal netw...
4,5,2020-02-20 06:29:18,Deep Learning based parking management system ...,Fastai provides easy to use wrappers to quickl...,"[code, tutorial, fastai, deep-learning, parkin...",Deep Learning based parking management system ...


In [None]:
# filtering
def filter(l, include=(), exclude=()):
    """Filter a list using inclusion and exclusion lists of items.

    Args:
        l: iterable of items to filter.
        include: items that are allowed through (nothing passes when empty).
        exclude: items that are always dropped, even if listed in `include`.

    Returns:
        list: items of `l`, in their original order, that are in `include`
        and not in `exclude`.
    """
    try:
        # Set lookups avoid rescanning the criteria for every item
        allowed, blocked = frozenset(include), frozenset(exclude)
        return [item for item in l if item in allowed and item not in blocked]
    except TypeError:
        # Unhashable criteria or items: fall back to linear membership tests
        return [item for item in l if item in include and item not in exclude]

In [None]:
# Inclusion/exclusion criteria for tags
include = list(tags_dict.keys())
exclude = ['machine-learning', 'deep-learning',  'data-science',
           'neural-networks', 'python', 'r', 'visualization']

In [None]:
# Filter tags for each project
df.tags = df.tags.apply(filter, include=include, exclude=exclude)
tags = Counter(itertools.chain.from_iterable(df.tags.values))

In [None]:
@widgets.interact(min_tag_freq=(0, tags.most_common()[0][1]))
def separate_tags_by_freq(min_tag_freq=30):
    """Print the tags on either side of a minimum-frequency cutoff.

    Tags whose count in `tags` is at least `min_tag_freq` are reported as
    kept; all other tags are reported as having missed the cut.
    """
    kept, dropped = Counter(), Counter()
    # Single pass over every occurrence, routing it to the matching bucket
    for tag in tags.elements():
        bucket = kept if tags[tag] >= min_tag_freq else dropped
        bucket[tag] += 1
    print ("Most popular tags:\n", kept.most_common(5))
    print ("\nTags that just made the cut:\n", kept.most_common()[-5:])
    print ("\nTags that just missed the cut:\n", dropped.most_common(5))

interactive(children=(IntSlider(value=30, description='min_tag_freq', max=424), Output()), _dom_classes=('widg…

In [None]:
# Filter tags that have fewer than <min_tag_freq> occurrences
min_tag_freq = 30
tags_above_freq = Counter(tag for tag in tags.elements()
                          if tags[tag] >= min_tag_freq)
df.tags = df.tags.apply(filter, include=list(tags_above_freq.keys()))

In [None]:
# Remove projects that have no relevant tags left after filtering
df = df[df.tags.map(len) > 0]
print (f"{len(df)} projects")

1444 projects


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [None]:
nltk.download('stopwords')
STOPWORDS = stopwords.words('english')
porter = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def preprocess(text, lower=True, stem=False,
               filters=r"""[!"'#$%&()*\+,-./:;<=>?@\\[\]^_`{|}~]""",
               stopwords=STOPWORDS):
    """Conditional preprocessing on our text unique to our task.

    Args:
        text (str): raw input text.
        lower (bool): lowercase the text before any other step.
        stem (bool): stem every token with the module-level `porter` stemmer.
        filters (str): regex (a character class by default) of characters to strip.
        stopwords (iterable of str): words to remove; an empty collection
            disables stopword removal.

    Returns:
        str: the cleaned text with single spaces between tokens.
    """
    # Lower
    if lower:
        text = text.lower()

    # Remove links (must happen before punctuation is stripped, otherwise
    # "http://..." is already broken into fragments and no longer matches)
    text = re.sub(r'http\S+', '', text)

    # Remove stopwords (skipped when empty: an empty alternation would match
    # at every word boundary and swallow the whitespace between words)
    if stopwords:
        pattern = re.compile(
            r'\b(' + r'|'.join(map(re.escape, stopwords)) + r')\b\s*')
        text = pattern.sub('', text)

    # Spacing and filters
    text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)
    text = re.sub(filters, r"", text)
    text = re.sub('[^A-Za-z0-9]+', ' ', text) # remove non alphanumeric chars
    text = re.sub(' +', ' ', text)  # remove multiple spaces
    text = text.strip()

    # Stemming
    if stem:
        text = " ".join([porter.stem(word) for word in text.split(' ')])

    return text

In [None]:
@widgets.interact(lower=True, stem=False)
def display_preprocessed_text(lower, stem):
    """Print a fixed sample sentence after `preprocess` with the chosen options."""
    sample = "Conditional image generation using Variational Autoencoders and GANs."
    print (preprocess(text=sample, lower=lower, stem=stem))

interactive(children=(Checkbox(value=True, description='lower'), Checkbox(value=False, description='stem'), Ou…

In [None]:
# Apply to dataframe
original_df = df.copy()
df.text = df.text.apply(preprocess, lower=True, stem=False)
print (f"{original_df.text.values[0]}\n{df.text.values[0]}")

Machine Learning Basics A practical set of notebooks on machine learning basics, implemented in both TF2.0 + Keras and PyTorch.
machine learning basics practical set notebooks machine learning basics implemented tf2 0 keras pytorch


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from wordcloud import WordCloud, STOPWORDS
sns.set_theme()
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import random

In [None]:
# Set seeds for reproducibility
seed = 42
np.random.seed(seed)
random.seed(seed)

In [None]:
# Shuffle
df = df.sample(frac=1).reset_index(drop=True)

In [None]:
# Get data
X = df.text.to_numpy()
y = df.tags

In [None]:
class LabelEncoder(object):
    """Label encoder for tag labels.

    Maps each class (tag) to a column index and converts between lists of
    tags and multi-hot indicator rows.
    """
    def __init__(self, class_to_index=None):
        """Create an encoder, optionally from an existing class -> index mapping."""
        # Every instance gets its own dict: a shared `{}` default would be
        # mutated by `fit` and leak classes from one encoder into the next.
        self.class_to_index = dict(class_to_index) if class_to_index else {}
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        """Number of known classes."""
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y):
        """Learn the class -> index mapping from an iterable of label lists.

        The mapping is rebuilt from scratch, with classes indexed in the
        sorted order produced by `np.unique`.

        Returns:
            LabelEncoder: self, to allow chaining.
        """
        # .tolist() stores plain Python values instead of numpy scalars
        classes = np.unique(list(itertools.chain.from_iterable(y))).tolist()
        self.class_to_index = {class_: i for i, class_ in enumerate(classes)}
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())
        return self

    def encode(self, y):
        """Convert an iterable of label lists into a multi-hot integer array.

        Returns:
            np.ndarray: shape (len(y), number of classes) with 1 for each
            label present in the row.

        Raises:
            KeyError: if a label was not seen by `fit`.
        """
        y_one_hot = np.zeros((len(y), len(self.class_to_index)), dtype=int)
        for i, item in enumerate(y):
            for class_ in item:
                y_one_hot[i][self.class_to_index[class_]] = 1
        return y_one_hot

    def decode(self, y):
        """Convert multi-hot rows back into lists of class names."""
        classes = []
        for item in y:
            # np.asarray lets plain Python lists be compared element-wise too
            indices = np.where(np.asarray(item) == 1)[0]
            classes.append([self.index_to_class[index] for index in indices])
        return classes

    def save(self, fp):
        """Write the class -> index mapping to the JSON file at path `fp`."""
        with open(fp, 'w') as f:
            contents = {'class_to_index': self.class_to_index}
            json.dump(contents, f, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        """Build an encoder from a JSON file previously written by `save`."""
        with open(fp, 'r') as f:
            kwargs = json.load(fp=f)
        return cls(**kwargs)

In [None]:
# Encode
label_encoder = LabelEncoder()
label_encoder.fit(y)
num_classes = len(label_encoder)

In [None]:
label_encoder.class_to_index

{'attention': 0,
 'autoencoders': 1,
 'computer-vision': 2,
 'convolutional-neural-networks': 3,
 'data-augmentation': 4,
 'embeddings': 5,
 'flask': 6,
 'generative-adversarial-networks': 7,
 'graph-neural-networks': 8,
 'graphs': 9,
 'huggingface': 10,
 'image-classification': 11,
 'interpretability': 12,
 'keras': 13,
 'language-modeling': 14,
 'natural-language-processing': 15,
 'node-classification': 16,
 'object-detection': 17,
 'pretraining': 18,
 'production': 19,
 'pytorch': 20,
 'question-answering': 21,
 'regression': 22,
 'reinforcement-learning': 23,
 'representation-learning': 24,
 'scikit-learn': 25,
 'segmentation': 26,
 'self-supervised-learning': 27,
 'tensorflow': 28,
 'tensorflow-js': 29,
 'time-series': 30,
 'transfer-learning': 31,
 'transformers': 32,
 'unsupervised-learning': 33,
 'wandb': 34}

In [None]:
# Sample
label_encoder.encode([["attention", "data-augmentation"]])

array([[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# Encode all our labels
y = label_encoder.encode(y)
print (y.shape)

(1444, 35)


In [None]:
!pip install scikit-multilearn==0.2.0 -q



In [None]:
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix

In [None]:
# Split sizes
train_size = 0.7
val_size = 0.15
test_size = 0.15

In [None]:
# Split (train)
X_train, X_, y_train, y_ = train_test_split(X, y, train_size=train_size)

In [None]:
# Split (val/test)
X_val, X_test, y_val, y_test = train_test_split(
    X_, y_, train_size=0.5)

In [None]:
print(f"train: {len(X_train)} ({len(X_train)/len(X):.2f})\n"
      f"val: {len(X_val)} ({len(X_val)/len(X):.2f})\n"
      f"test: {len(X_test)} ({len(X_test)/len(X):.2f})")

train: 1010 (0.70)
val: 217 (0.15)
test: 217 (0.15)


In [None]:
# Get counts for each class
counts = {}
counts['train_counts'] = Counter(str(combination) for row in get_combination_wise_output_matrix(
    y_train, order=1) for combination in row)
counts['val_counts'] = Counter(str(combination) for row in get_combination_wise_output_matrix(
    y_val, order=1) for combination in row)
counts['test_counts'] = Counter(str(combination) for row in get_combination_wise_output_matrix(
    y_test, order=1) for combination in row)

counts

{'test_counts': Counter({'(0,)': 15,
          '(1,)': 8,
          '(10,)': 7,
          '(11,)': 8,
          '(12,)': 9,
          '(13,)': 12,
          '(14,)': 6,
          '(15,)': 66,
          '(16,)': 5,
          '(17,)': 11,
          '(18,)': 3,
          '(19,)': 5,
          '(2,)': 60,
          '(20,)': 41,
          '(21,)': 4,
          '(22,)': 2,
          '(23,)': 10,
          '(24,)': 10,
          '(25,)': 11,
          '(26,)': 10,
          '(27,)': 2,
          '(28,)': 26,
          '(29,)': 6,
          '(3,)': 19,
          '(30,)': 4,
          '(31,)': 5,
          '(32,)': 24,
          '(33,)': 4,
          '(34,)': 5,
          '(4,)': 3,
          '(5,)': 8,
          '(6,)': 7,
          '(7,)': 11,
          '(8,)': 11,
          '(9,)': 15}),
 'train_counts': Counter({'(0,)': 79,
          '(1,)': 28,
          '(10,)': 45,
          '(11,)': 32,
          '(12,)': 40,
          '(13,)': 66,
          '(14,)': 35,
          '(15,)': 289,
        

In [None]:
# View distributions
pd.DataFrame({
    "train": counts["train_counts"],
    "val": counts["val_counts"],
    "test": counts["test_counts"]
}).T.fillna(0)

Unnamed: 0,"(2,)","(26,)","(15,)","(5,)","(20,)","(32,)","(0,)","(28,)","(27,)","(17,)","(25,)","(1,)","(34,)","(3,)","(31,)","(11,)","(19,)","(10,)","(18,)","(13,)","(16,)","(9,)","(21,)","(6,)","(23,)","(33,)","(14,)","(29,)","(22,)","(30,)","(4,)","(24,)","(12,)","(8,)","(7,)"
train,271,33,289,58,184,134,79,158,30,46,43,28,28,69,32,32,40,45,24,66,24,54,23,20,39,29,35,30,42,25,30,41,40,35,49
val,57,5,69,9,33,38,26,29,8,12,6,5,6,18,9,11,6,12,3,15,4,9,5,7,10,6,10,4,5,5,8,6,6,5,13
test,60,10,66,8,41,24,15,26,2,11,11,8,5,19,5,8,5,7,3,12,5,15,4,7,10,4,6,6,2,4,3,10,9,11,11


In [None]:
# Adjust counts across splits
for k in counts["val_counts"].keys():
    counts["val_counts"][k] = int(counts["val_counts"][k] * \
        (train_size/val_size))
for k in counts["test_counts"].keys():
    counts["test_counts"][k] = int(counts["test_counts"][k] * \
        (train_size/test_size))

In [None]:
dist_df = pd.DataFrame({
    "train": counts["train_counts"],
    "val": counts["val_counts"],
    "test": counts["test_counts"]
}).T.fillna(0)
dist_df

Unnamed: 0,"(2,)","(26,)","(15,)","(5,)","(20,)","(32,)","(0,)","(28,)","(27,)","(17,)","(25,)","(1,)","(34,)","(3,)","(31,)","(11,)","(19,)","(10,)","(18,)","(13,)","(16,)","(9,)","(21,)","(6,)","(23,)","(33,)","(14,)","(29,)","(22,)","(30,)","(4,)","(24,)","(12,)","(8,)","(7,)"
train,271,33,289,58,184,134,79,158,30,46,43,28,28,69,32,32,40,45,24,66,24,54,23,20,39,29,35,30,42,25,30,41,40,35,49
val,266,23,322,42,154,177,121,135,37,56,28,23,28,84,42,51,28,56,14,70,18,42,23,32,46,28,46,18,23,23,37,28,28,23,60
test,280,46,308,37,191,112,70,121,9,51,51,37,23,88,23,37,23,32,14,56,23,70,18,32,46,18,28,28,9,18,14,46,42,51,51


In [None]:
# Mean, across classes, of the standard deviation between the split counts
np.mean(np.std(dist_df.to_numpy(), axis=0))

8.644273732900697

In [None]:
from skmultilearn.model_selection import IterativeStratification

In [None]:
def iterative_train_test_split(X, y, train_size):
    """Custom iterative train test split which
    'maintains balanced representation with respect
    to order-th label combinations.'

    Args:
        X: indexable array of inputs.
        y: indexable array of labels aligned with X.
        train_size (float): fraction of the data requested for the train split.

    Returns:
        tuple: (X_train, X_test, y_train, y_test).
    """
    fold_sizes = [1.0-train_size, train_size, ]
    stratifier = IterativeStratification(
        n_splits=2, order=1, sample_distribution_per_fold=fold_sizes)
    # Only the first (train, test) pair of index arrays from the splitter is used
    train_indices, test_indices = next(stratifier.split(X, y))
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

In [None]:
# Get data
X = df.text.to_numpy()
y = df.tags

In [None]:
# Binarize y
label_encoder = LabelEncoder()
label_encoder.fit(y)
y = label_encoder.encode(y)

In [None]:
# Split
X_train, X_, y_train, y_ = iterative_train_test_split(
    X, y, train_size=train_size)
X_val, X_test, y_val, y_test = iterative_train_test_split(
    X_, y_, train_size=0.5)

In [None]:
print(f"train: {len(X_train)} ({len(X_train)/len(X):.2f})\n"
      f"val: {len(X_val)} ({len(X_val)/len(X):.2f})\n"
      f"test: {len(X_test)} ({len(X_test)/len(X):.2f})")

train: 1023 (0.71)
val: 218 (0.15)
test: 203 (0.14)


In [None]:
# Get counts for each class
counts = {}
counts["train_counts"] = Counter(str(combination) for row in get_combination_wise_output_matrix(
    y_train, order=1) for combination in row)
counts["val_counts"] = Counter(str(combination) for row in get_combination_wise_output_matrix(
    y_val, order=1) for combination in row)
counts["test_counts"] = Counter(str(combination) for row in get_combination_wise_output_matrix(
    y_test, order=1) for combination in row)

In [None]:
# Adjust counts across splits
for k in counts["val_counts"].keys():
    counts["val_counts"][k] = int(counts["val_counts"][k] * \
        (train_size/val_size))
for k in counts["test_counts"].keys():
    counts["test_counts"][k] = int(counts["test_counts"][k] * \
        (train_size/test_size))

In [None]:
# View distributions
pd.DataFrame({
    "train": counts["train_counts"],
    "val": counts["val_counts"],
    "test": counts["test_counts"]
}).T.fillna(0)

Unnamed: 0,"(15,)","(7,)","(2,)","(1,)","(20,)","(4,)","(8,)","(9,)","(0,)","(32,)","(31,)","(5,)","(14,)","(13,)","(28,)","(27,)","(33,)","(19,)","(29,)","(3,)","(11,)","(17,)","(18,)","(25,)","(30,)","(23,)","(12,)","(10,)","(21,)","(34,)","(22,)","(26,)","(6,)","(24,)","(16,)"
train,297,51,272,29,181,29,36,55,84,142,31,52,42,65,149,28,27,36,28,74,37,51,26,42,24,41,38,49,27,28,34,34,24,42,24
val,298,46,270,37,177,28,46,56,60,121,60,56,23,56,149,28,28,32,28,74,28,51,9,42,23,42,42,37,14,28,32,28,28,23,9
test,294,56,270,18,182,28,23,51,107,130,9,51,18,74,149,28,28,37,28,74,37,32,9,42,23,42,37,32,9,23,37,37,18,46,32


In [None]:
dist_df = pd.DataFrame({
    "train": counts["train_counts"],
    "val": counts["val_counts"],
    "test": counts["test_counts"]
}).T.fillna(0)

In [None]:
# Mean, across classes, of the standard deviation between the split counts
np.mean(np.std(dist_df.to_numpy(), axis=0))

4.878424991639657

In [None]:
# Split DataFrames
train_df = pd.DataFrame({"text": X_train, "tags": label_encoder.decode(y_train)})
val_df = pd.DataFrame({"text": X_val, "tags": label_encoder.decode(y_val)})
test_df = pd.DataFrame({"text": X_test, "tags": label_encoder.decode(y_test)})

In [None]:
train_df.head()

Unnamed: 0,text,tags
0,medacy medical text mining information extract...,[natural-language-processing]
1,pytorch tutorial deep learning researchers rep...,"[autoencoders, computer-vision, generative-adv..."
2,deltapy tabular data augmentation feature engi...,[data-augmentation]
3,graph convolution structured documents convert...,"[computer-vision, graph-neural-networks, graphs]"
4,illustrated bert elmo co nlp cracked transfer ...,"[attention, embeddings, language-modeling, nat..."


In [None]:
!python -m pip install --upgrade pip
!pip install nlpaug==1.1.0 transformers==3.0.2 -q
!pip install snorkel==0.9.6 -q --use-feature=2020-resolver



In [None]:
import nlpaug.augmenter.word as naw

In [None]:
# Load tokenizer and transformers
substituion = naw.ContextualWordEmbsAug(model_path="distilbert-base-uncased", action="substitute")
insertion = naw.ContextualWordEmbsAug(model_path="distilbert-base-uncased", action="insert")

In [None]:
text = "Conditional image generation using Variational Autoencoders and GANs."

In [None]:
augmentated_text = substituion.augment(text)
print(augmentated_text)

supports binary encoding using variational encoding and gans.


In [None]:
# Insertions
augmentated_text = insertion.augment(text)
print(augmentated_text)

performs conditional sequential image generation algorithms using dynamic variational autoencoders and gans.


In [None]:
import inflect
from snorkel.augmentation import transformation_function
inflect = inflect.engine()

In [None]:
# Inflect
print (inflect.singular_noun("graphs"))
print (inflect.singular_noun("graph"))
print (inflect.plural_noun("graph"))
print (inflect.plural_noun("graphs"))

graph
False
graphs
graphss


In [None]:
def replace_dash(x):
    """Return the string `x` with every "-" turned into a single space."""
    return " ".join(x.split("-"))

In [None]:
flat_tags_dict = {}
for tag, info in tags_dict.items():
    tag = tag.replace("-", " ")
    aliases = list(map(replace_dash, info["aliases"]))
    if len(aliases):
        flat_tags_dict[tag] = aliases
    for alias in aliases:
        _aliases = aliases + [tag]
        _aliases.remove(alias)
        flat_tags_dict[alias] = _aliases

In [None]:
# Tags that could be singular or plural
can_be_singular = [
    'animations',
    'cartoons',
    'autoencoders',
    'conditional random fields',
    'convolutional neural networks',
    'databases',
    'deep q networks',
    'gated recurrent units',
    'gaussian processes',
    'generative adversarial networks',
    'graph convolutional networks',
    'graph neural networks',
    'k nearest neighbors',
    'learning rates',
    'multilayer perceptrons',
    'outliers',
    'pos',
    'quasi recurrent neural networks',
    'recommendation systems',
    'recurrent neural networks',
    'streaming data',
    'data streams',
    'support vector machines',
    'variational autoencoders']
can_be_plural = [
    'annotation',
    'data annotation',
    'continuous integration',
    'continuous deployment',
    'crf',
    'conversational ai',
    'chatbot',
    'cnn',
    'db',
    'dqn',
    'expectation maximization',
    'fine tuning',
    'finetuning',
    'finetune',
    'gru',
    'gan',
    'gcn',
    'gnn',
    'hyperparameter optimization',
    'hyperparameter tuning',
    'image generation',
    'inference',
    'prediction',
    'knn',
    'knowledge base',
    'language modeling',
    'latent dirichlet allocation',
    'lstm',
    'machine translation',
    'model compression',
    'compression',
    'perceptron',
    'mlp',
    'optical character recognition',
    'outlier detection',
    'pos tagging',
    'pca',
    'qrnn',
    'rnn',
    'segmentation',
    'image segmentation',
    'spatial temporal cnn',
    'data streaming',
    'svm',
    'tabular',
    'temporal cnn',
    'tcnn',
    'vae',
    'vqa',
    'visualization',
    'data visualization']

In [None]:
# Add to flattened dict
for tag in can_be_singular:
    singular = inflect.singular_noun(tag)
    # singular_noun returns False when the word is already singular;
    # skip it rather than storing a `False` key in the dict
    if singular:
        flat_tags_dict[singular] = flat_tags_dict[tag]
for tag in can_be_plural:
    flat_tags_dict[inflect.plural_noun(tag)] = flat_tags_dict[tag]

In [None]:
print (flat_tags_dict["gan"])
print (flat_tags_dict["gans"])
print (flat_tags_dict["generative adversarial network"])
print (flat_tags_dict["generative adversarial networks"])

['generative adversarial networks']
['generative adversarial networks']
['gan']
['gan']


In [None]:
def find_word(word, text):
    """Search `text` for `word` as a whole word, ignoring case.

    Args:
        word (str): literal word or phrase to look for.
        text (str): text to search in.

    Returns:
        re.Match or None: the first match (group 1 is the matched text),
        or None when the word does not occur between word boundaries.
    """
    # Escape every regex metacharacter so the word is always matched literally
    pattern = re.compile(fr"\b({re.escape(word)})\b", flags=re.IGNORECASE)
    return pattern.search(text)

In [None]:
@transformation_function()
def swap_aliases(x):
    """ Swap ML keywords with their aliases

    Looks for every key of `flat_tags_dict` in `x.text`; if any are found,
    one occurrence is picked at random and replaced by a random alias of
    that key. `x` is updated in place and returned.
    """
    # Find all matches, remembering which dictionary key produced each one
    matches = []
    for tag in flat_tags_dict:
        match = find_word(tag, x.text)
        if match:
            matches.append((tag, match))

    # Swap a random match with a random alias
    if len(matches):
        # Look aliases up by the dictionary key, not by the matched text:
        # matching is case-insensitive, so the matched text may not be a key
        tag, match = random.choice(matches)
        x.text = f"{x.text[:match.start()]}{random.choice(flat_tags_dict[tag])}{x.text[match.end():]}"

    return x

In [None]:
# Swap
for i in range(3):
    sample_df = pd.DataFrame([{"text": "a survey of reinforcement learning for nlp tasks."}])
    sample_df.text = sample_df.text.apply(preprocess, lower=True, stem=False)
    print (swap_aliases(sample_df.iloc[0]).text)

survey reinforcement learning nlproc tasks
survey rl nlp tasks
survey rl nlp tasks


In [None]:
# Undesired behavior (needs contextual insight)
for i in range(3):
    sample_df = pd.DataFrame([{"text": "Autogenerate your CV to apply for jobs using NLP."}])
    sample_df.text = sample_df.text.apply(preprocess, lower=True, stem=False)
    print (swap_aliases(sample_df.iloc[0]).text)

autogenerate vision apply jobs using nlp
autogenerate cv apply jobs using natural language processing
autogenerate cv apply jobs using nlproc


In [None]:
from snorkel.augmentation import ApplyOnePolicy, PandasTFApplier

In [None]:
# Transformation function (TF) policy
policy = ApplyOnePolicy(n_per_original=5, keep_original=True)
tf_applier = PandasTFApplier([swap_aliases], policy)
train_df_augmented = tf_applier.apply(train_df)
train_df_augmented.drop_duplicates(subset=["text"], inplace=True)
train_df_augmented.head()

100%|██████████| 1023/1023 [00:15<00:00, 65.66it/s]


Unnamed: 0,text,tags
0,medacy medical text mining information extract...,[natural-language-processing]
1,pytorch tutorial deep learning researchers rep...,"[autoencoders, computer-vision, generative-adv..."
1,pytorch tutorial dl researchers repository pro...,"[autoencoders, computer-vision, generative-adv..."
2,deltapy tabular data augmentation feature engi...,[data-augmentation]
2,deltapy table augmentation feature engineering,[data-augmentation]


In [None]:
len(train_df), len(train_df_augmented)

(1023, 2037)

In [None]:
from sklearn.metrics import precision_recall_fscore_support
import torch

In [None]:
def set_seeds(seed=1234):
    """Seed NumPy, Python's `random` and PyTorch (CPU and CUDA) with `seed`."""
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

In [None]:
def get_data_splits(df, train_size=0.7):
    """Encode the tags of `df` and split it into train/val/test sets.

    Args:
        df: DataFrame with a `text` column and a `tags` column.
        train_size (float): fraction requested for the train split; the
            remainder is divided evenly between validation and test.

    Returns:
        tuple: (X_train, X_val, X_test, y_train, y_val, y_test, label_encoder).
    """
    # Binarize the labels with a freshly fitted encoder
    label_encoder = LabelEncoder()
    label_encoder.fit(df.tags)
    features = df.text.to_numpy()
    targets = label_encoder.encode(df.tags)

    # Carve out train first, then halve what is left into val and test
    X_train, X_rest, y_train, y_rest = iterative_train_test_split(
        features, targets, train_size=train_size)
    X_val, X_test, y_val, y_test = iterative_train_test_split(
        X_rest, y_rest, train_size=0.5)

    return X_train, X_val, X_test, y_train, y_val, y_test, label_encoder

In [None]:
class Trainer(object):
    """Training, evaluation and prediction loops around a PyTorch model.

    Every batch from a dataloader is treated as a sequence of tensors whose
    last element is the target; all preceding elements are passed to the
    model as a list.
    """
    def __init__(self, model, device, loss_fn=None, optimizer=None, scheduler=None):

        # set params
        self.model = model
        self.device = device
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler

    def train_step(self, dataloader):
        """Train step

        Runs one optimization pass over `dataloader`.

        Returns:
            float: running mean of the batch losses.
        """
        self.model.train()
        loss = 0.0

        for i, batch in enumerate(dataloader):
            batch = [item.to(self.device) for item in batch]
            inputs, targets = batch[:-1], batch[-1]
            self.optimizer.zero_grad()
            z = self.model(inputs)
            J = self.loss_fn(z, targets)
            J.backward()
            self.optimizer.step()

            # Incremental mean: avoids keeping every batch loss around
            loss += (J.detach().item() - loss) / (i+1)

        return loss

    def eval_step(self, dataloader):
        """Validation step

        Returns:
            tuple: (mean loss, stacked true labels, stacked sigmoid
            probabilities), the last two as NumPy arrays.
        """
        self.model.eval()
        loss = 0.0
        y_trues, y_probs = [], []

        with torch.no_grad():
            for i, batch in enumerate(dataloader):
                batch = [item.to(self.device) for item in batch]
                inputs, y_true = batch[:-1], batch[-1]

                z = self.model(inputs)
                J = self.loss_fn(z, y_true).item()

                loss += (J-loss) / (i+1)

                y_prob = torch.sigmoid(z).cpu().numpy()
                y_probs.extend(y_prob)
                y_trues.extend(y_true.cpu().numpy())

        return loss, np.vstack(y_trues), np.vstack(y_probs)

    def predict_step(self, dataloader):
        """prediction step

        Returns:
            np.ndarray: stacked sigmoid probabilities, one row per sample.
        """
        self.model.eval()
        y_probs = []

        with torch.no_grad():
            for batch in dataloader:
                # Same handling as eval_step: move to the device, turn the
                # logits into probabilities and bring them back as NumPy rows
                batch = [item.to(self.device) for item in batch]
                inputs = batch[:-1]
                z = self.model(inputs)
                y_probs.extend(torch.sigmoid(z).cpu().numpy())

        return np.vstack(y_probs)

    def train(self, num_epochs, patience, train_dataloader, val_dataloader):
        """Train with early stopping on the validation loss.

        Args:
            num_epochs (int): maximum number of epochs.
            patience (int): epochs without improvement tolerated before stopping.
            train_dataloader: batches used by `train_step`.
            val_dataloader: batches used by `eval_step`.

        Returns:
            The model recorded at the best validation loss.
        """
        best_val_loss = np.inf
        # Initialized up front so they exist even if no epoch improves on inf
        # (e.g. a NaN validation loss) or num_epochs is 0
        best_model = self.model
        _patience = patience
        for epoch in range(num_epochs):
            train_loss = self.train_step(dataloader=train_dataloader)
            val_loss, _, _ = self.eval_step(dataloader=val_dataloader)

            # The scheduler is optional (defaults to None in __init__)
            if self.scheduler is not None:
                self.scheduler.step(val_loss)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                # NOTE: this stores a reference to self.model, not a snapshot;
                # the returned object keeps training in later epochs
                best_model = self.model
                _patience = patience

            else:
                _patience -= 1

            if not _patience:
                print("Stopping Early")
                break

            print(
                f"Epoch: {epoch+1} | "
                f"train_loss: {train_loss:.5f}, "
                f"val_loss: {val_loss:.5f}, "
                f"lr: {self.optimizer.param_groups[0]['lr']:.2E}, "
                f"_patience: {_patience}"
            )

        return best_model

In [None]:
set_seeds()

In [None]:
preprocessed_df = df.copy()
preprocessed_df.text = preprocessed_df.text.apply(preprocess, lower=True)
X_train, X_val, X_test, y_train, y_val, y_test, label_encoder = get_data_splits(preprocessed_df)

In [None]:
print(label_encoder)
print(label_encoder.classes)

<LabelEncoder(num_classes=35)>
['attention', 'autoencoders', 'computer-vision', 'convolutional-neural-networks', 'data-augmentation', 'embeddings', 'flask', 'generative-adversarial-networks', 'graph-neural-networks', 'graphs', 'huggingface', 'image-classification', 'interpretability', 'keras', 'language-modeling', 'natural-language-processing', 'node-classification', 'object-detection', 'pretraining', 'production', 'pytorch', 'question-answering', 'regression', 'reinforcement-learning', 'representation-learning', 'scikit-learn', 'segmentation', 'self-supervised-learning', 'tensorflow', 'tensorflow-js', 'time-series', 'transfer-learning', 'transformers', 'unsupervised-learning', 'wandb']


In [None]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
set_seeds()

In [None]:
processed_df = df.copy()
processed_df.text = processed_df.text.apply(preprocess, lower=True)
X_train, X_val, X_test, y_train, y_val, y_test, label_encoder = get_data_splits(processed_df)
X_test_raw = X_test

In [None]:
# Split DataFrames
train_df = pd.DataFrame({"text": X_train, "tags": label_encoder.decode(y_train)})
val_df = pd.DataFrame({"text": X_val, "tags": label_encoder.decode(y_val)})
test_df = pd.DataFrame({"text": X_test, "tags": label_encoder.decode(y_test)})

In [None]:
# Set device
cuda = True
device = torch.device("cuda" if (
    torch.cuda.is_available() and cuda) else "cpu")
torch.set_default_tensor_type("torch.FloatTensor")
if device.type == "cuda":
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
print (device)

cuda


In [None]:
class Tokenizer(object):
    """Map texts to sequences of integer token indices and back.

    Index 0 is reserved for the padding token and index 1 for the
    out-of-vocabulary (OOV) token; fitted tokens receive indices from 2 on,
    most frequent first.

    Args:
        char_level: if True every character is a token, otherwise texts are
            split on single spaces.
        num_tokens: optional cap on the vocabulary size *including* the two
            special tokens. None keeps every token seen while fitting.
        pad_token: string stored at index 0.
        oov_token: string stored at index 1 and used for unknown tokens.
        token_to_index: optional existing vocabulary (as written by `save`).
    """

    def __init__(self, char_level, num_tokens=None, pad_token="<PAD>",
                 oov_token="<UNK>", token_to_index=None):
        self.char_level = char_level
        self.separator = '' if self.char_level else ' '
        if num_tokens: num_tokens -= 2  # pad + unk tokens
        self.num_tokens = num_tokens
        self.pad_token = pad_token
        self.oov_token = oov_token
        if not token_to_index:
            token_to_index = {pad_token: 0, oov_token: 1}
        # Copy so that fitting never mutates a dict owned by the caller.
        self.token_to_index = dict(token_to_index)
        self.index_to_token = {v:k for k, v in self.token_to_index.items()}

    def __len__(self):
        return len(self.token_to_index)

    def __str__(self):
        return f"<Tokenizer(num_tokens={len(self)})>"

    def fit_on_texts(self, texts):
        """Add the most frequent tokens of `texts` to the vocabulary.

        Sets `self.min_token_freq` to the count of the least frequent token
        that was selected (0 when nothing was selected). Returns `self`.
        """
        if not self.char_level:
            texts = [text.split(' ') for text in texts]
        all_tokens = [token for text in texts for token in text]
        counts = Counter(all_tokens).most_common(self.num_tokens)
        # An empty corpus (or a cap that leaves no room) selects nothing.
        self.min_token_freq = counts[-1][1] if counts else 0
        for token, _ in counts:
            # Skip tokens that already have an index (re-fitting, or a text
            # containing a special token); re-assigning them would not grow
            # the vocabulary and would hand the same index to the next token.
            if token in self.token_to_index:
                continue
            index = len(self)
            self.token_to_index[token] = index
            self.index_to_token[index] = token
        return self

    def texts_to_sequences(self, texts):
        """Convert texts to a list of 1-D numpy arrays of token indices.

        Unknown tokens map to the OOV index.
        """
        oov_index = self.token_to_index[self.oov_token]
        sequences = []
        for text in texts:
            if not self.char_level:
                text = text.split(" ")
            sequence = [self.token_to_index.get(token, oov_index)
                        for token in text]
            sequences.append(np.array(sequence))

        return sequences

    def sequences_to_texts(self, sequences):
        """Convert sequences of indices back to texts.

        Unknown indices are rendered as the OOV token.
        """
        texts = []
        for sequence in sequences:
            tokens = [self.index_to_token.get(index, self.oov_token)
                      for index in sequence]
            texts.append(self.separator.join(tokens))
        return texts

    def save(self, fp):
        """Write the tokenizer configuration and vocabulary to a JSON file."""
        contents = {
            "char_level": self.char_level,
            "pad_token": self.pad_token,
            "oov_token": self.oov_token,
            "token_to_index": self.token_to_index
        }
        with open(fp, 'w') as f:
            json.dump(contents, f, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        """Rebuild a tokenizer from a JSON file written by `save`.

        Files without a "pad_token" entry are still accepted; the default
        pad token name is used for them.
        """
        with open(fp, "r") as f:
            kwargs = json.load(fp=f)
        return cls(**kwargs)


In [None]:
# tokenize
char_level = True
tokenizer = Tokenizer(char_level=char_level)
tokenizer.fit_on_texts(texts=X_train)

<__main__.Tokenizer at 0x7fad0150ae90>

In [None]:
vocab_size = len(tokenizer)

print(tokenizer)

<Tokenizer(num_tokens=39)>


In [None]:
# Convert texts to sequences of indices
# The sequences have different lengths, so the outer array must be an object
# array; recent NumPy versions refuse to build ragged arrays implicitly.
X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)
# Decode the first training sample again to sanity-check the round trip.
preprocessed_text = tokenizer.sequences_to_texts([X_train[0]])[0]
print ("Text to indices:\n"
    f"  (preprocessed) → {preprocessed_text}\n"
    f"  (tokenized) → {X_train[0]}")

Text to indices:
  (preprocessed) → medacy medical text mining information extraction spacy
  (tokenized) → [16  3 14  7 12 21  2 16  3 14  4 12  7 11  2  6  3 25  6  2 16  4  5  4
  5 15  2  4  5 19 10  8 16  7  6  4 10  5  2  3 25  6  8  7 12  6  4 10
  5  2  9 13  7 12 21]


In [None]:
all_tags = list(itertools.chain.from_iterable(df.tags.values))

In [None]:
# Class weights
# Count how often each class index occurs in `all_tags`, then weight every
# class by the inverse of its frequency so rare tags contribute more.
# NOTE(review): `all_tags` is built from `df.tags`, i.e. the whole dataframe
# rather than only the training split — confirm this is intended.
counts = np.bincount([label_encoder.class_to_index[class_] for class_ in all_tags])
class_weights = {i: 1.0/count for i, count in enumerate(counts)}
print (f"class counts: {counts},\nclass weights: {class_weights}")

class counts: [120  41 388 106  41  75  34  73  51  78  64  51  55  93  51 424  33  69
  30  51 258  32  49  59  57  60  48  40 213  40  34  46 196  39  39],
class weights: {0: 0.008333333333333333, 1: 0.024390243902439025, 2: 0.002577319587628866, 3: 0.009433962264150943, 4: 0.024390243902439025, 5: 0.013333333333333334, 6: 0.029411764705882353, 7: 0.0136986301369863, 8: 0.0196078431372549, 9: 0.01282051282051282, 10: 0.015625, 11: 0.0196078431372549, 12: 0.01818181818181818, 13: 0.010752688172043012, 14: 0.0196078431372549, 15: 0.0023584905660377358, 16: 0.030303030303030304, 17: 0.014492753623188406, 18: 0.03333333333333333, 19: 0.0196078431372549, 20: 0.003875968992248062, 21: 0.03125, 22: 0.02040816326530612, 23: 0.01694915254237288, 24: 0.017543859649122806, 25: 0.016666666666666666, 26: 0.020833333333333332, 27: 0.025, 28: 0.004694835680751174, 29: 0.025, 30: 0.029411764705882353, 31: 0.021739130434782608, 32: 0.00510204081632653, 33: 0.02564102564102564, 34: 0.02564102564102564

In [None]:
from torch.nn.utils.rnn import pad_sequence

In [None]:
def pad_sequences(sequences, max_seq_len=0):
    """Right-pad sequences with zeros into one 2-D array.

    Args:
        sequences: iterable (with a length) of 1-D index sequences.
        max_seq_len: minimum width of the result; the actual width is the
            larger of this value and the longest sequence.

    Returns:
        A float numpy array of shape (len(sequences), width). An empty
        input yields an array with zero rows instead of raising.
    """
    # `default=0` keeps an empty batch from raising ValueError in max().
    longest = max((len(sequence) for sequence in sequences), default=0)
    max_seq_len = max(max_seq_len, longest)
    padded_sequences = np.zeros((len(sequences), max_seq_len))
    for i, sequence in enumerate(sequences):
        padded_sequences[i][:len(sequence)] = sequence
    return padded_sequences

In [None]:
class CNNTextDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenized inputs `X` with targets `y` for the CNN.

    `max_filter_size` is passed to `pad_sequences` as the minimum padded
    length when batches are collated.
    """

    def __init__(self, X, y, max_filter_size):
        self.X, self.y = X, y
        self.max_filter_size = max_filter_size

    def __len__(self):
        # The number of samples is defined by the targets.
        return len(self.y)

    def __str__(self):
        return f"<Dataset(N={len(self)})>"

    def __getitem__(self, index):
        """Return the `[input, target]` pair stored at `index`."""
        return [self.X[index], self.y[index]]

    def collate_fn(self, batch):
        """Turn a list of `[input, target]` pairs into padded tensors.

        Returns a `(LongTensor inputs, FloatTensor targets)` tuple.
        """
        samples = np.array(batch, dtype=object)
        sequences = samples[:, 0]
        targets = np.stack(samples[:, 1], axis=0)

        # Pad every input up to the longest one (at least max_filter_size).
        padded = pad_sequences(
            sequences=sequences, max_seq_len=self.max_filter_size)

        inputs_tensor = torch.LongTensor(padded.astype(np.int32))
        targets_tensor = torch.FloatTensor(targets.astype(np.int32))
        return inputs_tensor, targets_tensor

    def create_dataloader(self, batch_size, shuffle=False, drop_last=False):
        """Build a DataLoader over this dataset that uses `collate_fn`."""
        loader_kwargs = dict(
            dataset=self,
            batch_size=batch_size,
            collate_fn=self.collate_fn,
            shuffle=shuffle,
            drop_last=drop_last,
            pin_memory=True,
        )
        return torch.utils.data.DataLoader(**loader_kwargs)

In [None]:
# Create datasets
filter_sizes = list(range(1, 11))
train_dataset = CNNTextDataset(
    X=X_train, y=y_train, max_filter_size=max(filter_sizes))
val_dataset = CNNTextDataset(
    X=X_val, y=y_val, max_filter_size=max(filter_sizes))
test_dataset = CNNTextDataset(
    X=X_test, y=y_test, max_filter_size=max(filter_sizes))
print ("Data splits:\n"
    f"  Train dataset:{train_dataset.__str__()}\n"
    f"  Val dataset: {val_dataset.__str__()}\n"
    f"  Test dataset: {test_dataset.__str__()}\n"
    "Sample point:\n"
    f"  X: {train_dataset[0][0]}\n"
    f"  y: {train_dataset[0][1]}")

Data splits:
  Train dataset:<Dataset(N=1023)>
  Val dataset: <Dataset(N=205)>
  Test dataset: <Dataset(N=216)>
Sample point:
  X: [16  3 14  7 12 21  2 16  3 14  4 12  7 11  2  6  3 25  6  2 16  4  5  4
  5 15  2  4  5 19 10  8 16  7  6  4 10  5  2  3 25  6  8  7 12  6  4 10
  5  2  9 13  7 12 21]
  y: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [None]:
# Create dataloaders
batch_size = 64
train_dataloader = train_dataset.create_dataloader(
    batch_size=batch_size)
val_dataloader = val_dataset.create_dataloader(
    batch_size=batch_size)
test_dataloader = test_dataset.create_dataloader(
    batch_size=batch_size)
batch_X, batch_y = next(iter(train_dataloader))
print ("Sample batch:\n"
    f"  X: {list(batch_X.size())}\n"
    f"  y: {list(batch_y.size())}")

Sample batch:
  X: [64, 185]
  y: [64, 35]


In [None]:
embedding_dim = 128
num_filters = 128
hidden_dim = 128
dropout_p = 0.5

In [None]:
class CNN(nn.Module):
    """Text CNN: embedding -> parallel 1-D convolutions -> global max pool -> 2 FC layers.

    One `Conv1d` is created per entry of `filter_sizes`; their max-pooled
    outputs are concatenated before the fully connected layers. The forward
    pass returns the raw outputs of `fc2` (no activation is applied).
    """

    def __init__(self, embedding_dim, vocab_size, num_filters, filter_sizes, 
                 hidden_dim, dropout_p, num_classes, padding_idx=0):
        super().__init__()

        # Token embeddings; the row at `padding_idx` is the padding vector.
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx,
        )

        # One convolution per kernel size.
        self.filter_sizes = filter_sizes
        convolutions = []
        for kernel_size in filter_sizes:
            convolutions.append(nn.Conv1d(
                in_channels=embedding_dim,
                out_channels=num_filters,
                kernel_size=kernel_size))
        self.conv = nn.ModuleList(convolutions)

        # Classifier head.
        self.dropout = nn.Dropout(dropout_p)
        self.fc1 = nn.Linear(num_filters * len(filter_sizes), hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, inputs, channel_first=False):
        """Compute the outputs for a batch.

        Args:
            inputs: a one-element sequence holding the token index tensor.
            channel_first: if False, the embedded batch is transposed so the
                embedding dimension becomes the channel dimension.
        """
        (token_ids,) = inputs
        embedded = self.embeddings(token_ids)
        if not channel_first:
            embedded = embedded.transpose(1, 2)  # (N, channels, seq_length)

        seq_len = embedded.shape[2]
        pooled = []
        for conv, kernel_size in zip(self.conv, self.filter_sizes):
            # SAME padding: split the total padding between both sides,
            # giving the extra element (if any) to the right.
            total_padding = conv.stride[0] * (seq_len - 1) - seq_len + kernel_size
            left = int(total_padding / 2)
            right = int(math.ceil(total_padding / 2))

            features = conv(F.pad(embedded, (left, right)))

            # Global max pool over the sequence dimension.
            pooled.append(F.max_pool1d(features, features.size(2)).squeeze(2))

        # Concatenate the pooled features of all kernel sizes, then classify.
        hidden = self.fc1(torch.cat(pooled, 1))
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

In [None]:
model = CNN(
    embedding_dim=embedding_dim, vocab_size=vocab_size,
    num_filters=num_filters, filter_sizes=filter_sizes,
    hidden_dim=hidden_dim, dropout_p=dropout_p,
    num_classes=num_classes
)
model = model.to(device)
print(model.named_parameters)

<bound method Module.named_parameters of CNN(
  (embeddings): Embedding(39, 128, padding_idx=0)
  (conv): ModuleList(
    (0): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
    (1): Conv1d(128, 128, kernel_size=(2,), stride=(1,))
    (2): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
    (3): Conv1d(128, 128, kernel_size=(4,), stride=(1,))
    (4): Conv1d(128, 128, kernel_size=(5,), stride=(1,))
    (5): Conv1d(128, 128, kernel_size=(6,), stride=(1,))
    (6): Conv1d(128, 128, kernel_size=(7,), stride=(1,))
    (7): Conv1d(128, 128, kernel_size=(8,), stride=(1,))
    (8): Conv1d(128, 128, kernel_size=(9,), stride=(1,))
    (9): Conv1d(128, 128, kernel_size=(10,), stride=(1,))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=1280, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=35, bias=True)
)>


In [None]:
# Arguments
lr = 2e-4
num_epochs = 200
patience = 10


In [None]:
# Define loss
class_weights_tensor = torch.Tensor(np.array(list(class_weights.values())))
loss_fn = nn.BCEWithLogitsLoss(weight=class_weights_tensor)

In [None]:
# Define optimizer & scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=5)

In [None]:
# Trainer module
trainer = Trainer(
    model=model, device=device, loss_fn=loss_fn,
    optimizer=optimizer, scheduler=scheduler)

In [None]:
from pathlib import Path
from sklearn.metrics import precision_recall_curve

In [None]:
# Determining the best threshold
def find_best_threshold(y_true, y_prob):
    """Find the best threshold for maximum F1.

    Args:
        y_true: flat array of binary ground-truth labels.
        y_prob: flat array of predicted scores, aligned with `y_true`.

    Returns:
        The threshold from `precision_recall_curve` with the highest F1.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_prob)
    # precision_recall_curve returns one more precision/recall value than
    # thresholds; the final (precision=1, recall=0) point has no threshold,
    # so drop it to keep the argmax a valid index into `thresholds`.
    precisions, recalls = precisions[:-1], recalls[:-1]
    denominator = precisions + recalls
    # Where precision + recall == 0 the F1 is defined as 0 here; a plain
    # division would give NaN, which np.argmax treats as the maximum.
    f1s = np.divide(
        2 * precisions * recalls, denominator,
        out=np.zeros_like(denominator, dtype=float), where=denominator > 0)
    return thresholds[np.argmax(f1s)]

In [None]:
!pip install mlflow



In [None]:
from argparse import Namespace
import mlflow
from pathlib import Path

In [None]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-5.1.0.tar.gz (745 kB)
[?25l[K     |▍                               | 10 kB 32.5 MB/s eta 0:00:01[K     |▉                               | 20 kB 17.2 MB/s eta 0:00:01[K     |█▎                              | 30 kB 10.5 MB/s eta 0:00:01[K     |█▊                              | 40 kB 8.5 MB/s eta 0:00:01[K     |██▏                             | 51 kB 5.0 MB/s eta 0:00:01[K     |██▋                             | 61 kB 5.5 MB/s eta 0:00:01[K     |███                             | 71 kB 5.3 MB/s eta 0:00:01[K     |███▌                            | 81 kB 6.0 MB/s eta 0:00:01[K     |████                            | 92 kB 4.7 MB/s eta 0:00:01[K     |████▍                           | 102 kB 5.0 MB/s eta 0:00:01[K     |████▉                           | 112 kB 5.0 MB/s eta 0:00:01[K     |█████▎                          | 122 kB 5.0 MB/s eta 0:00:01[K     |█████▊                          | 133 kB 5.0 MB/s eta 0:00:01[K     |████

In [None]:
# Specify arguments
args = Namespace(
    char_level=True,
    filter_sizes=list(range(1, 11)),
    batch_size=128,
    embedding_dim=128, 
    num_filters=128,
    hidden_dim=128, 
    dropout_p=0.5,
    lr=2e-4,
    num_epochs=200,
    patience=10,
)

In [None]:
# Set tracking URI
MODEL_REGISTRY = Path("experiments")
Path(MODEL_REGISTRY).mkdir(exist_ok=True) # create experiments dir
mlflow.set_tracking_uri("file://" + str(MODEL_REGISTRY.absolute()))

In [None]:
!ls

experiments  sample_data


In [None]:
# Trainer (modified for experiment tracking)
class Trainer(object):
    """Training / evaluation / prediction loops with MLflow metric tracking.

    The model is called with a list of input tensors (every batch element
    except the last); the last batch element is the target.
    """

    def __init__(self, model, device, loss_fn=None, 
                 optimizer=None, scheduler=None):

        # Set params
        self.model = model
        self.device = device
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler

    def train_step(self, dataloader):
        """Run one training epoch and return the mean batch loss."""
        # Set model to train mode
        self.model.train()
        loss = 0.0

        # Iterate over train batches
        for i, batch in enumerate(dataloader):
            # Step
            batch = [item.to(self.device) for item in batch]
            inputs, targets = batch[:-1], batch[-1]
            self.optimizer.zero_grad()  # Reset gradients
            z = self.model(inputs)  # Forward pass
            J = self.loss_fn(z, targets)  # Define loss
            J.backward()  # Backward pass
            self.optimizer.step()  # Update weights

            # Cumulative Metrics (running mean of the batch losses)
            loss += (J.detach().item() - loss) / (i + 1)

        return loss

    def eval_step(self, dataloader):
        """Validation or test step.

        Returns:
            (mean batch loss, stacked targets, stacked sigmoid probabilities).
        """
        # Set model to eval mode
        self.model.eval()
        loss = 0.0
        y_trues, y_probs = [], []

        # Iterate over val batches
        with torch.no_grad():
            for i, batch in enumerate(dataloader):

                # Step
                batch = [item.to(self.device) for item in batch]  # Set device
                inputs, y_true = batch[:-1], batch[-1]
                z = self.model(inputs)  # Forward pass
                J = self.loss_fn(z, y_true).item()

                # Cumulative Metrics
                loss += (J - loss) / (i + 1)

                # Store outputs
                y_prob = torch.sigmoid(z).cpu().numpy()
                y_probs.extend(y_prob)
                y_trues.extend(y_true.cpu().numpy())

        return loss, np.vstack(y_trues), np.vstack(y_probs)

    def predict_step(self, dataloader):
        """Prediction step.

        Returns the stacked sigmoid probabilities for every sample, in the
        same form as the probabilities returned by `eval_step`.
        """
        # Set model to eval mode
        self.model.eval()
        y_probs = []

        # Iterate over batches
        with torch.no_grad():
            for i, batch in enumerate(dataloader):

                # Forward pass w/ inputs (the last batch item is a filler target)
                batch = [item.to(self.device) for item in batch]  # Set device
                inputs = batch[:-1]
                z = self.model(inputs)

                # Store outputs as probabilities so thresholds apply to them
                y_prob = torch.sigmoid(z).cpu().numpy()
                y_probs.extend(y_prob)

        return np.vstack(y_probs)
    
    def train(self, num_epochs, patience, train_dataloader, val_dataloader):
        """Train with early stopping on the validation loss.

        Train/validation losses are logged to the active MLflow run after
        every epoch that does not trigger early stopping.

        Returns:
            (model, best_val_loss) where the model carries the weights of
            the epoch with the lowest validation loss.
        """
        best_val_loss = np.inf
        best_state = None
        _patience = patience  # defined even if no epoch ever improves
        for epoch in range(num_epochs):
            # Steps
            train_loss = self.train_step(dataloader=train_dataloader)
            val_loss, _, _ = self.eval_step(dataloader=val_dataloader)
            if self.scheduler is not None:
                self.scheduler.step(val_loss)

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                # Snapshot the weights: keeping only a reference to the
                # model would silently follow later (worse) updates.
                best_state = {
                    name: tensor.detach().clone()
                    for name, tensor in self.model.state_dict().items()
                }
                _patience = patience  # reset _patience
            else:
                _patience -= 1
            if not _patience:  # 0
                print("Stopping early!")
                break

            # Tracking
            mlflow.log_metrics(
                {"train_loss": train_loss, "val_loss": val_loss}, step=epoch
            )

            # Logging
            print(
                f"Epoch: {epoch+1} | "
                f"train_loss: {train_loss:.5f}, "
                f"val_loss: {val_loss:.5f}, "
                f"lr: {self.optimizer.param_groups[0]['lr']:.2E}, "
                f"_patience: {_patience}"
            )

        # Restore the best weights so the returned model matches best_val_loss.
        if best_state is not None:
            self.model.load_state_dict(best_state)
        best_model = self.model

        return best_model, best_val_loss

In [None]:
def train_cnn(args, df):
    """Train a CNN using specific arguments.

    Args:
        args: namespace with char_level, filter_sizes, batch_size,
            embedding_dim, num_filters, hidden_dim, dropout_p, lr,
            num_epochs and patience.
        df: dataframe with `text` and `tags` columns.

    Returns:
        dict with the args, fitted tokenizer, label encoder, trained model,
        test-set performance (weighted precision/recall/f1) and the best
        validation loss.
    """

    # Set seeds
    set_seeds()

    # Get data splits
    preprocessed_df = df.copy()
    preprocessed_df.text = preprocessed_df.text.apply(preprocess, lower=True)
    X_train, X_val, X_test, y_train, y_val, y_test, label_encoder = get_data_splits(preprocessed_df)
    num_classes = len(label_encoder)

    # Set device
    cuda = True
    device = torch.device("cuda" if (
        torch.cuda.is_available() and cuda) else "cpu")
    torch.set_default_tensor_type("torch.FloatTensor")
    if device.type == "cuda":
        torch.set_default_tensor_type("torch.cuda.FloatTensor")

    # Tokenize
    tokenizer = Tokenizer(char_level=args.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    vocab_size = len(tokenizer)

    # Convert texts to sequences of indices
    # (dtype=object: the sequences are ragged, which recent NumPy versions
    # refuse to turn into an array implicitly)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # Class weights
    # Derived from the `df` argument instead of a notebook-level global so
    # the function does not depend on another cell having been run.
    # NOTE(review): this counts tags over every split, not only train.
    all_tags = list(itertools.chain.from_iterable(df.tags.values))
    counts = np.bincount([label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0/count for i, count in enumerate(counts)}

    # Create datasets
    train_dataset = CNNTextDataset(
        X=X_train, y=y_train, max_filter_size=max(args.filter_sizes))
    val_dataset = CNNTextDataset(
        X=X_val, y=y_val, max_filter_size=max(args.filter_sizes))
    test_dataset = CNNTextDataset(
        X=X_test, y=y_test, max_filter_size=max(args.filter_sizes))

    # Create dataloaders
    train_dataloader = train_dataset.create_dataloader(
        batch_size=args.batch_size)
    val_dataloader = val_dataset.create_dataloader(
        batch_size=args.batch_size)
    test_dataloader = test_dataset.create_dataloader(
        batch_size=args.batch_size)

    # Initialize model
    model = CNN(
        embedding_dim=args.embedding_dim, vocab_size=vocab_size,
        num_filters=args.num_filters, filter_sizes=args.filter_sizes,
        hidden_dim=args.hidden_dim, dropout_p=args.dropout_p,
        num_classes=num_classes)
    model = model.to(device)

    # Define loss
    class_weights_tensor = torch.Tensor(np.array(list(class_weights.values())))
    loss_fn = nn.BCEWithLogitsLoss(weight=class_weights_tensor)

    # Define optimizer & scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.1, patience=5)

    # Trainer module
    trainer = Trainer(
        model=model, device=device, loss_fn=loss_fn, 
        optimizer=optimizer, scheduler=scheduler)

    # Train
    best_model, best_val_loss = trainer.train(
        args.num_epochs, args.patience, train_dataloader, val_dataloader)

    # Best threshold for f1 (chosen on the training split)
    train_loss, y_true, y_prob = trainer.eval_step(dataloader=train_dataloader)
    threshold = find_best_threshold(y_true.ravel(), y_prob.ravel())

    # Determine predictions using threshold
    test_loss, y_true, y_prob = trainer.eval_step(dataloader=test_dataloader)
    y_pred = np.array([np.where(prob >= threshold, 1, 0) for prob in y_prob])

    # Evaluate (simple)
    metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
    performance = {"precision": metrics[0], "recall": metrics[1], "f1": metrics[2]}

    return {
        "args": args,
        "tokenizer": tokenizer,
        "label_encoder": label_encoder,
        "model": best_model,
        "performance": performance,
        "best_val_loss": best_val_loss,
    }

In [None]:
import tempfile

In [None]:
# Set experiment
mlflow.set_experiment(experiment_name="baselines")

INFO: 'baselines' does not exist. Creating a new experiment


In [None]:
def save_dict(d, filepath):
    """Serialize the dict `d` as JSON (2-space indent, insertion order) to `filepath`."""
    with open(filepath, "w") as json_file:
        json.dump(d, json_file, indent=2, sort_keys=False)

In [None]:
# Tracking
with mlflow.start_run(run_name="cnn") as run:

    # Train & evaluate
    artifacts = train_cnn(args=args, df=df)

    # Log key metrics
    # best_val_loss is logged as well because the run search further down
    # orders runs by "metrics.best_val_loss".
    mlflow.log_metrics({
        "precision": artifacts["performance"]["precision"],
        "recall": artifacts["performance"]["recall"],
        "f1": artifacts["performance"]["f1"],
        "best_val_loss": artifacts["best_val_loss"],
    })

    # Log artifacts (written to a temporary directory, then uploaded)
    with tempfile.TemporaryDirectory() as dp:
        artifacts["tokenizer"].save(Path(dp, "tokenizer.json"))
        artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
        torch.save(artifacts["model"].state_dict(), Path(dp, "model.pt"))
        save_dict(artifacts["performance"], Path(dp, "performance.json"))
        mlflow.log_artifacts(dp)

    # Log parameters
    mlflow.log_params(vars(artifacts["args"]))

Epoch: 1 | train_loss: 0.00680, val_loss: 0.00303, lr: 2.00E-04, _patience: 10
Epoch: 2 | train_loss: 0.00393, val_loss: 0.00327, lr: 2.00E-04, _patience: 9
Epoch: 3 | train_loss: 0.00403, val_loss: 0.00329, lr: 2.00E-04, _patience: 8
Epoch: 4 | train_loss: 0.00373, val_loss: 0.00299, lr: 2.00E-04, _patience: 10
Epoch: 5 | train_loss: 0.00348, val_loss: 0.00283, lr: 2.00E-04, _patience: 10
Epoch: 6 | train_loss: 0.00332, val_loss: 0.00280, lr: 2.00E-04, _patience: 10
Epoch: 7 | train_loss: 0.00322, val_loss: 0.00277, lr: 2.00E-04, _patience: 10
Epoch: 8 | train_loss: 0.00318, val_loss: 0.00276, lr: 2.00E-04, _patience: 10
Epoch: 9 | train_loss: 0.00306, val_loss: 0.00272, lr: 2.00E-04, _patience: 10
Epoch: 10 | train_loss: 0.00298, val_loss: 0.00268, lr: 2.00E-04, _patience: 10
Epoch: 11 | train_loss: 0.00290, val_loss: 0.00265, lr: 2.00E-04, _patience: 10
Epoch: 12 | train_loss: 0.00282, val_loss: 0.00262, lr: 2.00E-04, _patience: 10
Epoch: 13 | train_loss: 0.00277, val_loss: 0.00257,

In [None]:
from pyngrok import ngrok

In [None]:
# https://stackoverflow.com/questions/61615818/setting-up-mlflow-on-google-colab
get_ipython().system_raw("mlflow server -h 0.0.0.0 -p 5000 --backend-store-uri $PWD/experiments/ &")
ngrok.kill()
ngrok.set_auth_token("")
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel.public_url)

MLflow Tracking UI: https://c69d-34-133-105-254.ngrok.io


In [None]:
def load_dict(filepath):
    """Read the JSON file at `filepath` and return its parsed content."""
    with open(filepath, "r") as json_file:
        return json.load(json_file)

In [None]:
# Load all runs from experiment
experiment_id = mlflow.get_experiment_by_name("baselines").experiment_id
all_runs = mlflow.search_runs(experiment_ids=experiment_id, order_by=["metrics.best_val_loss ASC"])
print (all_runs)

                             run_id  ... tags.mlflow.runName
0  1fe720a77cb14964b10c83bb0fdc5498  ...                 cnn
1  f5353ce0c8ff405d88e0159a9b92a6f4  ...                 cnn

[2 rows x 25 columns]


In [None]:
# Load the artifacts of the first run returned by the search above.
device = torch.device("cpu")
# Row 0 of `all_runs`, i.e. the first run in the order requested from search_runs.
best_run_id = all_runs.iloc[0].run_id
best_run = mlflow.get_run(run_id=best_run_id)
client = mlflow.tracking.MlflowClient()

# Everything is parsed inside the `with` block because the downloaded files
# are removed together with the temporary directory when it exits.
with tempfile.TemporaryDirectory() as dp:
    client.download_artifacts(run_id=best_run_id, path="", dst_path=dp)
    tokenizer = Tokenizer.load(fp=Path(dp, "tokenizer.json"))
    label_encoder = LabelEncoder.load(fp=Path(dp, "label_encoder.json"))
    # map_location loads the saved weights onto `device` (CPU here).
    model_state = torch.load(Path(dp, "model.pt"), map_location=device)
    performance = load_dict(filepath=Path(dp, "performance.json"))



In [None]:
print (json.dumps(performance, indent=2))

{
  "precision": 0.7454592687466495,
  "recall": 0.5875831485587583,
  "f1": 0.6362426249188632
}


In [None]:
# load artifacts
device = torch.device("cpu")
model = CNN(
    embedding_dim=args.embedding_dim, vocab_size=len(tokenizer),
    num_filters=args.num_filters, filter_sizes=args.filter_sizes,
    hidden_dim=args.hidden_dim, dropout_p=args.dropout_p,
    num_classes=len(label_encoder)
)
model.load_state_dict(model_state)
model.to(device)


CNN(
  (embeddings): Embedding(39, 128, padding_idx=0)
  (conv): ModuleList(
    (0): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
    (1): Conv1d(128, 128, kernel_size=(2,), stride=(1,))
    (2): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
    (3): Conv1d(128, 128, kernel_size=(4,), stride=(1,))
    (4): Conv1d(128, 128, kernel_size=(5,), stride=(1,))
    (5): Conv1d(128, 128, kernel_size=(6,), stride=(1,))
    (6): Conv1d(128, 128, kernel_size=(7,), stride=(1,))
    (7): Conv1d(128, 128, kernel_size=(8,), stride=(1,))
    (8): Conv1d(128, 128, kernel_size=(9,), stride=(1,))
    (9): Conv1d(128, 128, kernel_size=(10,), stride=(1,))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=1280, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=35, bias=True)
)

In [None]:
trainer = Trainer(model=model, device=device)

In [None]:
# Dataloader
text = "Transfer learning with BERT for self-supervised learning"
# Preprocess with the same setting used for training (lower=True).
X = np.array(tokenizer.texts_to_sequences([preprocess(text, lower=True)]))
# The dataset needs a target per sample; any valid label works as a filler.
y_filler = label_encoder.encode([np.array([label_encoder.classes[0]]*len(X))])
# Take the sizes from `args`, the same source the model above was built
# from, rather than from loose notebook globals set in earlier cells.
dataset = CNNTextDataset(
    X=X, y=y_filler, max_filter_size=max(args.filter_sizes))
dataloader = dataset.create_dataloader(
    batch_size=args.batch_size)

In [None]:
# manual threshold
threshold = 0.29

In [None]:
# Inference
y_prob = trainer.predict_step(dataloader)
y_pred = np.array([np.where(prob >= threshold, 1, 0) for prob in y_prob])
label_encoder.decode(y_pred)

[['natural-language-processing',
  'self-supervised-learning',
  'transfer-learning',
  'transformers']]

## Optimization with Optuna

In [None]:
!pip install optuna==2.4.0 numpyencoder==0.3.0

Collecting optuna==2.4.0
  Downloading optuna-2.4.0-py3-none-any.whl (282 kB)
[K     |████████████████████████████████| 282 kB 5.0 MB/s 
[?25hCollecting numpyencoder==0.3.0
  Downloading numpyencoder-0.3.0-py3-none-any.whl (3.0 kB)
Collecting cliff
  Downloading cliff-3.9.0-py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 12.0 MB/s 
Collecting colorlog
  Downloading colorlog-6.4.1-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.6.0
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting pbr!=2.1.0,>=2.0.0
  Downloading pbr-5.6.0-py2.py3-none-any.whl (111 kB)
[K     |████████████████████████████████| 111 kB 72.2 MB/s 
[?25hCollecting cmd2>=1.0.0
  Downloading cmd2-2.1.2-py3-none-any.whl (141 kB)
[K     |████████████████████████████████| 141 kB 60.8 MB/s 
Collecting stevedore>=2.0.1
  Downloading stevedore-3.4.0-py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 8.1 MB/s 
[?25hCollecting autopage>=0.4.0
  Downloading autopage-0

In [None]:
import optuna

In [None]:
from argparse import Namespace

In [None]:
# arguments
args = Namespace(
    char_level=True,
    filter_sizes=list(range(1,11)),
    batch_size=64,
    embedding_dim=128,
    num_filters=128,
    hidden_dim=128,
    dropout_p=0.5,
    lr=2e-4,
    num_epochs=100,
    patience=10
)

In [None]:
class Trainer(object):
    """Training / evaluation / prediction loops with optional Optuna pruning.

    The model is called with a list of input tensors (every batch element
    except the last); the last batch element is the target. When `trial` is
    given, the validation loss is reported to it after every epoch and the
    run is aborted with `optuna.TrialPruned` if the trial asks for pruning.
    """

    def __init__(self, model, device, loss_fn=None, optimizer=None, 
                 scheduler=None, trial=None):
        self.model = model
        self.device = device
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.trial = trial

    def train_step(self, dataloader):
        """Run one training epoch and return the mean batch loss."""
        self.model.train()
        loss = 0.0

        for i, batch in enumerate(dataloader):
            batch = [item.to(self.device) for item in batch]
            inputs, targets = batch[:-1], batch[-1]
            self.optimizer.zero_grad()
            z = self.model(inputs)
            J = self.loss_fn(z, targets)
            J.backward()
            self.optimizer.step()

            # Running mean of the batch losses
            loss += (J.detach().item() - loss) / (i+1)

        return loss

    def eval_step(self, dataloader):
        """Validation or test step.

        Returns:
            (mean batch loss, stacked targets, stacked sigmoid probabilities).
        """
        self.model.eval()
        loss = 0.0
        y_trues, y_probs = [], []

        # Iterate over val batches
        with torch.no_grad():
            for i, batch in enumerate(dataloader):

                # Step
                batch = [item.to(self.device) for item in batch]  # Set device
                inputs, y_true = batch[:-1], batch[-1]
                z = self.model(inputs)  # Forward pass
                J = self.loss_fn(z, y_true).item()

                # Cumulative Metrics
                loss += (J - loss) / (i + 1)

                # Store outputs
                y_prob = torch.sigmoid(z).cpu().numpy()
                y_probs.extend(y_prob)
                y_trues.extend(y_true.cpu().numpy())

        return loss, np.vstack(y_trues), np.vstack(y_probs)

    def predict_step(self, dataloader):
        """Prediction step: return stacked sigmoid probabilities."""
        # Set model to eval mode
        self.model.eval()
        y_probs = []

        # Iterate over batches
        with torch.no_grad():
            for i, batch in enumerate(dataloader):

                # Forward pass w/ inputs (the last batch item is a filler target)
                batch = [item.to(self.device) for item in batch]  # Set device
                inputs = batch[:-1]
                z = self.model(inputs)

                # Store outputs
                y_prob = torch.sigmoid(z).cpu().numpy()
                y_probs.extend(y_prob)

        return np.vstack(y_probs)

    def train(self, num_epochs, patience, train_dataloader, val_dataloader, 
              tolerance=1e-5):
        """Train with early stopping on the validation loss.

        Args:
            num_epochs: maximum number of epochs.
            patience: epochs without improvement before stopping.
            train_dataloader: iterable of training batches.
            val_dataloader: iterable of validation batches.
            tolerance: accepted for interface compatibility; currently unused.

        Returns:
            (model, best_val_loss) where the model carries the weights of
            the epoch with the lowest validation loss.

        Raises:
            optuna.TrialPruned: if a trial is attached and requests pruning.
        """
        best_val_loss = np.inf
        best_state = None
        _patience = patience  # defined even if no epoch ever improves

        for epoch in range(num_epochs):
            train_loss = self.train_step(dataloader=train_dataloader)
            val_loss, _, _ = self.eval_step(dataloader=val_dataloader)
            if self.scheduler is not None:
                self.scheduler.step(val_loss)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                # Snapshot the weights: keeping only a reference to the
                # model would silently follow later (worse) updates.
                best_state = {
                    name: tensor.detach().clone()
                    for name, tensor in self.model.state_dict().items()
                }
                _patience = patience
            else:
                _patience -= 1

            if not _patience:
                print("Stopping Early!")
                break

            print(
                f"Epoch: {epoch+1} | "
                f"train_loss: {train_loss:.5f}, "
                f"val_loss: {val_loss:.5f}, "
                f"lr: {self.optimizer.param_groups[0]['lr']:.2E}, "
                f"_patience: {_patience}"
            )

            # Pruning based on intermediate value (only inside an Optuna trial;
            # `trial` defaults to None for plain training runs)
            if self.trial is not None:
                self.trial.report(val_loss, epoch)
                if self.trial.should_prune():
                    raise optuna.TrialPruned()

        # Restore the best weights so the returned model matches best_val_loss.
        if best_state is not None:
            self.model.load_state_dict(best_state)
        best_model = self.model

        return best_model, best_val_loss

In [None]:
def train_cnn(args, df, trial=None):
    """Train and evaluate a CNN tagger for one set of hyperparameters.

    Args:
        args: namespace read for `char_level`, `filter_sizes`, `batch_size`,
            `embedding_dim`, `num_filters`, `hidden_dim`, `dropout_p`, `lr`,
            `num_epochs` and `patience`.
        df: dataframe with a `text` column; it is copied, not modified.
        trial: optional Optuna trial forwarded to the `Trainer`.

    Returns:
        dict: keys `args`, `tokenizer`, `label_encoder`, `model`,
        `performance` (weighted precision / recall / f1 on the test split),
        `best_val_loss` and `threshold`.
    """
    set_seeds()

    # Preprocess a copy so the caller's dataframe is left untouched
    preprocessed_df = df.copy()
    preprocessed_df.text = preprocessed_df.text.apply(preprocess, lower=True)
    X_train, X_val, X_test, y_train, y_val, y_test, label_encoder = get_data_splits(preprocessed_df)
    num_classes = len(label_encoder)

    # Set device
    cuda = True
    device = torch.device("cuda" if (
        torch.cuda.is_available() and cuda) else "cpu")
    torch.set_default_tensor_type("torch.FloatTensor")
    if device.type == "cuda":
        torch.set_default_tensor_type("torch.cuda.FloatTensor")

    # Tokenize (vocabulary is fit on the training texts only)
    tokenizer = Tokenizer(char_level=args.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    vocab_size = len(tokenizer)

    # Convert texts to sequences of indices
    X_train = np.array(tokenizer.texts_to_sequences(X_train))
    X_val = np.array(tokenizer.texts_to_sequences(X_val))
    X_test = np.array(tokenizer.texts_to_sequences(X_test))

    # Class weights (inverse tag frequency)
    # NOTE(review): `train_df` is not defined in this function; it is read from
    # the notebook's global scope and must have been created by an earlier cell.
    train_tags = list(itertools.chain.from_iterable(train_df.tags.values))
    counts = np.bincount([label_encoder.class_to_index[class_] for class_ in train_tags])
    class_weights = {i: 1.0/count for i, count in enumerate(counts)}

    # Create datasets
    max_filter_size = max(args.filter_sizes)
    train_dataset = CNNTextDataset(
        X=X_train, y=y_train, max_filter_size=max_filter_size)
    val_dataset = CNNTextDataset(
        X=X_val, y=y_val, max_filter_size=max_filter_size)
    test_dataset = CNNTextDataset(
        X=X_test, y=y_test, max_filter_size=max_filter_size)

    # Create dataloaders
    train_dataloader = train_dataset.create_dataloader(
        batch_size=args.batch_size)
    val_dataloader = val_dataset.create_dataloader(
        batch_size=args.batch_size)
    test_dataloader = test_dataset.create_dataloader(
        batch_size=args.batch_size)

    # Initialize model
    model = CNN(
        embedding_dim=args.embedding_dim, vocab_size=vocab_size,
        num_filters=args.num_filters, filter_sizes=args.filter_sizes,
        hidden_dim=args.hidden_dim, dropout_p=args.dropout_p,
        num_classes=num_classes)
    model = model.to(device)

    # Define loss
    class_weights_tensor = torch.Tensor(np.array(list(class_weights.values())))
    loss_fn = nn.BCEWithLogitsLoss(weight=class_weights_tensor)

    # Define optimizer & scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.1, patience=5)

    # Trainer module
    trainer = Trainer(
        model=model, device=device, loss_fn=loss_fn,
        optimizer=optimizer, scheduler=scheduler, trial=trial)

    # Train
    best_model, best_val_loss = trainer.train(
        args.num_epochs, args.patience, train_dataloader, val_dataloader)

    # Best threshold for f1 (selected on the training dataloader)
    _, y_true, y_prob = trainer.eval_step(dataloader=train_dataloader)
    threshold = find_best_threshold(y_true.ravel(), y_prob.ravel())

    # Determine predictions using threshold
    _, y_true, y_prob = trainer.eval_step(dataloader=test_dataloader)
    y_pred = np.where(y_prob >= threshold, 1, 0)

    # Evaluate (simple) against the targets returned together with y_prob, so
    # rows are aligned by construction rather than by dataloader ordering.
    metrics = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    performance = {"precision": metrics[0], "recall": metrics[1], "f1": metrics[2]}

    return {
        "args": args,
        "tokenizer": tokenizer,
        "label_encoder": label_encoder,
        "model": best_model,
        "performance": performance,
        "best_val_loss": best_val_loss,
        "threshold": threshold,
    }

In [None]:
def objective(trial, args):
    """Optuna objective: sample hyperparameters, train, and return f1.

    Args:
        trial: Optuna trial used to sample values and to store user attributes.
        args: namespace of training arguments; the sampled values are written
            onto it in place before training.

    Returns:
        The `f1` entry of the `performance` dict returned by `train_cnn`.
    """
    # Parameters to tune
    args.embedding_dim = trial.suggest_int("embedding_dim", 128, 512)
    args.num_filters = trial.suggest_int("num_filters", 128, 512)
    args.hidden_dim = trial.suggest_int("hidden_dim", 128, 512)
    # suggest_float supersedes the deprecated suggest_uniform / suggest_loguniform
    args.dropout_p = trial.suggest_float("dropout_p", 0.3, 0.8)
    args.lr = trial.suggest_float("lr", 5e-5, 5e-4, log=True)

    # Train and evaluate (`df` is read from the notebook's global scope)
    artifacts = train_cnn(args=args, df=df, trial=trial)
    performance = artifacts["performance"]

    # Additional attributes stored on the trial
    for metric in ("precision", "recall", "f1"):
        trial.set_user_attr(metric, performance[metric])
    trial.set_user_attr("threshold", artifacts["threshold"])

    return performance["f1"]

In [None]:
from numpyencoder import NumpyEncoder
from optuna.integration.mlflow import MLflowCallback

In [None]:
NUM_TRIALS = 50  # number of trials passed to study.optimize(n_trials=...)

In [None]:
# optimize
# Median pruner; per Optuna's documentation, n_startup_trials delays pruning
# until that many trials have finished and n_warmup_steps disables it for the
# first steps of each trial.
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
# Study that maximizes the value returned by `objective` (the f1 score).
# NOTE(review): the trainer's `train` method reports val_loss (lower is better)
# via trial.report, while this study's direction is "maximize"; confirm the
# pruner interprets the intermediate values as intended.
study = optuna.create_study(study_name="optimization_1", direction="maximize", pruner=pruner)
# Callback that logs each trial to the current MLflow tracking URI under the
# metric name "f1".
mlflow_callback = MLflowCallback(
    tracking_uri=mlflow.get_tracking_uri(), metric_name="f1"
)
# `args` is the notebook-level arguments namespace; `objective` overwrites its
# tuned fields on every trial, so the same object is reused across trials.
study.optimize(
    lambda trial: objective(trial, args),
    n_trials=NUM_TRIALS,
    callbacks=[mlflow_callback]
)

[32m[I 2021-09-02 01:17:28,571][0m A new study created in memory with name: optimization_1[0m


Epoch: 1 | train_loss: 0.00891, val_loss: 0.00436, lr: 7.24E-05, _patience: 10
Epoch: 2 | train_loss: 0.00507, val_loss: 0.00456, lr: 7.24E-05, _patience: 9
Epoch: 3 | train_loss: 0.00494, val_loss: 0.00430, lr: 7.24E-05, _patience: 10
Epoch: 4 | train_loss: 0.00463, val_loss: 0.00414, lr: 7.24E-05, _patience: 10
Epoch: 5 | train_loss: 0.00449, val_loss: 0.00408, lr: 7.24E-05, _patience: 10
Epoch: 6 | train_loss: 0.00438, val_loss: 0.00405, lr: 7.24E-05, _patience: 10
Epoch: 7 | train_loss: 0.00427, val_loss: 0.00400, lr: 7.24E-05, _patience: 10
Epoch: 8 | train_loss: 0.00415, val_loss: 0.00394, lr: 7.24E-05, _patience: 10
Epoch: 9 | train_loss: 0.00405, val_loss: 0.00387, lr: 7.24E-05, _patience: 10
Epoch: 10 | train_loss: 0.00393, val_loss: 0.00381, lr: 7.24E-05, _patience: 10
Epoch: 11 | train_loss: 0.00385, val_loss: 0.00375, lr: 7.24E-05, _patience: 10
Epoch: 12 | train_loss: 0.00374, val_loss: 0.00367, lr: 7.24E-05, _patience: 10
Epoch: 13 | train_loss: 0.00363, val_loss: 0.00359

[32m[I 2021-09-02 01:19:13,294][0m Trial 0 finished with value: 0.6609949866000874 and parameters: {'embedding_dim': 216, 'num_filters': 180, 'hidden_dim': 256, 'dropout_p': 0.5789252763161006, 'lr': 7.239503094923748e-05}. Best is trial 0 with value: 0.6609949866000874.[0m


INFO: 'optimization_1' does not exist. Creating a new experiment
Epoch: 1 | train_loss: 0.00761, val_loss: 0.00582, lr: 1.94E-04, _patience: 10
Epoch: 2 | train_loss: 0.00501, val_loss: 0.00405, lr: 1.94E-04, _patience: 10
Epoch: 3 | train_loss: 0.00413, val_loss: 0.00388, lr: 1.94E-04, _patience: 10
Epoch: 4 | train_loss: 0.00369, val_loss: 0.00357, lr: 1.94E-04, _patience: 10
Epoch: 5 | train_loss: 0.00334, val_loss: 0.00326, lr: 1.94E-04, _patience: 10
Epoch: 6 | train_loss: 0.00298, val_loss: 0.00299, lr: 1.94E-04, _patience: 10
Epoch: 7 | train_loss: 0.00264, val_loss: 0.00280, lr: 1.94E-04, _patience: 10
Epoch: 8 | train_loss: 0.00237, val_loss: 0.00267, lr: 1.94E-04, _patience: 10
Epoch: 9 | train_loss: 0.00216, val_loss: 0.00255, lr: 1.94E-04, _patience: 10
Epoch: 10 | train_loss: 0.00195, val_loss: 0.00247, lr: 1.94E-04, _patience: 10
Epoch: 11 | train_loss: 0.00181, val_loss: 0.00241, lr: 1.94E-04, _patience: 10
Epoch: 12 | train_loss: 0.00162, val_loss: 0.00243, lr: 1.94E-04

[32m[I 2021-09-02 01:20:30,698][0m Trial 1 finished with value: 0.6878502065377774 and parameters: {'embedding_dim': 182, 'num_filters': 486, 'hidden_dim': 496, 'dropout_p': 0.4667952286923533, 'lr': 0.0001935715676509405}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00856, val_loss: 0.00450, lr: 8.74E-05, _patience: 10
Epoch: 2 | train_loss: 0.00639, val_loss: 0.00435, lr: 8.74E-05, _patience: 10
Epoch: 3 | train_loss: 0.00569, val_loss: 0.00408, lr: 8.74E-05, _patience: 10
Epoch: 4 | train_loss: 0.00530, val_loss: 0.00397, lr: 8.74E-05, _patience: 10
Epoch: 5 | train_loss: 0.00490, val_loss: 0.00388, lr: 8.74E-05, _patience: 10
Epoch: 6 | train_loss: 0.00467, val_loss: 0.00378, lr: 8.74E-05, _patience: 10
Epoch: 7 | train_loss: 0.00444, val_loss: 0.00365, lr: 8.74E-05, _patience: 10
Epoch: 8 | train_loss: 0.00415, val_loss: 0.00359, lr: 8.74E-05, _patience: 10
Epoch: 9 | train_loss: 0.00396, val_loss: 0.00347, lr: 8.74E-05, _patience: 10
Epoch: 10 | train_loss: 0.00384, val_loss: 0.00340, lr: 8.74E-05, _patience: 10
Epoch: 11 | train_loss: 0.00359, val_loss: 0.00330, lr: 8.74E-05, _patience: 10
Epoch: 12 | train_loss: 0.00342, val_loss: 0.00318, lr: 8.74E-05, _patience: 10
Epoch: 13 | train_loss: 0.00330, val_loss: 0.0030

[32m[I 2021-09-02 01:23:20,020][0m Trial 2 finished with value: 0.6545345523397196 and parameters: {'embedding_dim': 417, 'num_filters': 267, 'hidden_dim': 142, 'dropout_p': 0.6175157402027225, 'lr': 8.744178370658122e-05}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00701, val_loss: 0.00526, lr: 1.52E-04, _patience: 10
Epoch: 2 | train_loss: 0.00509, val_loss: 0.00417, lr: 1.52E-04, _patience: 10
Epoch: 3 | train_loss: 0.00449, val_loss: 0.00399, lr: 1.52E-04, _patience: 10
Epoch: 4 | train_loss: 0.00422, val_loss: 0.00387, lr: 1.52E-04, _patience: 10
Epoch: 5 | train_loss: 0.00394, val_loss: 0.00370, lr: 1.52E-04, _patience: 10
Epoch: 6 | train_loss: 0.00376, val_loss: 0.00352, lr: 1.52E-04, _patience: 10
Epoch: 7 | train_loss: 0.00351, val_loss: 0.00336, lr: 1.52E-04, _patience: 10
Epoch: 8 | train_loss: 0.00325, val_loss: 0.00317, lr: 1.52E-04, _patience: 10
Epoch: 9 | train_loss: 0.00301, val_loss: 0.00298, lr: 1.52E-04, _patience: 10
Epoch: 10 | train_loss: 0.00281, val_loss: 0.00285, lr: 1.52E-04, _patience: 10
Epoch: 11 | train_loss: 0.00266, val_loss: 0.00276, lr: 1.52E-04, _patience: 10
Epoch: 12 | train_loss: 0.00250, val_loss: 0.00265, lr: 1.52E-04, _patience: 10
Epoch: 13 | train_loss: 0.00232, val_loss: 0.0025

[32m[I 2021-09-02 01:24:24,184][0m Trial 3 finished with value: 0.6641709919290605 and parameters: {'embedding_dim': 336, 'num_filters': 149, 'hidden_dim': 304, 'dropout_p': 0.6186526418309053, 'lr': 0.00015162361532599776}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00922, val_loss: 0.00559, lr: 1.81E-04, _patience: 10
Epoch: 2 | train_loss: 0.00657, val_loss: 0.00409, lr: 1.81E-04, _patience: 10
Epoch: 3 | train_loss: 0.00534, val_loss: 0.00401, lr: 1.81E-04, _patience: 10
Epoch: 4 | train_loss: 0.00481, val_loss: 0.00383, lr: 1.81E-04, _patience: 10
Epoch: 5 | train_loss: 0.00450, val_loss: 0.00370, lr: 1.81E-04, _patience: 10
Epoch: 6 | train_loss: 0.00417, val_loss: 0.00352, lr: 1.81E-04, _patience: 10
Epoch: 7 | train_loss: 0.00390, val_loss: 0.00335, lr: 1.81E-04, _patience: 10
Epoch: 8 | train_loss: 0.00363, val_loss: 0.00322, lr: 1.81E-04, _patience: 10
Epoch: 9 | train_loss: 0.00343, val_loss: 0.00302, lr: 1.81E-04, _patience: 10
Epoch: 10 | train_loss: 0.00319, val_loss: 0.00284, lr: 1.81E-04, _patience: 10
Epoch: 11 | train_loss: 0.00305, val_loss: 0.00275, lr: 1.81E-04, _patience: 10
Epoch: 12 | train_loss: 0.00297, val_loss: 0.00271, lr: 1.81E-04, _patience: 10
Epoch: 13 | train_loss: 0.00278, val_loss: 0.0026

[32m[I 2021-09-02 01:26:11,152][0m Trial 4 finished with value: 0.6556943228637905 and parameters: {'embedding_dim': 293, 'num_filters': 360, 'hidden_dim': 290, 'dropout_p': 0.7788319993004884, 'lr': 0.00018090028216966177}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.01202, val_loss: 0.00454, lr: 2.44E-04, _patience: 10
Epoch: 2 | train_loss: 0.00821, val_loss: 0.00406, lr: 2.44E-04, _patience: 10
Epoch: 3 | train_loss: 0.00667, val_loss: 0.00387, lr: 2.44E-04, _patience: 10
Epoch: 4 | train_loss: 0.00602, val_loss: 0.00370, lr: 2.44E-04, _patience: 10
Epoch: 5 | train_loss: 0.00550, val_loss: 0.00361, lr: 2.44E-04, _patience: 10
Epoch: 6 | train_loss: 0.00494, val_loss: 0.00346, lr: 2.44E-04, _patience: 10
Epoch: 7 | train_loss: 0.00455, val_loss: 0.00335, lr: 2.44E-04, _patience: 10
Epoch: 8 | train_loss: 0.00433, val_loss: 0.00312, lr: 2.44E-04, _patience: 10
Epoch: 9 | train_loss: 0.00411, val_loss: 0.00304, lr: 2.44E-04, _patience: 10
Epoch: 10 | train_loss: 0.00389, val_loss: 0.00305, lr: 2.44E-04, _patience: 9
Epoch: 11 | train_loss: 0.00372, val_loss: 0.00290, lr: 2.44E-04, _patience: 10
Epoch: 12 | train_loss: 0.00353, val_loss: 0.00285, lr: 2.44E-04, _patience: 10
Epoch: 13 | train_loss: 0.00330, val_loss: 0.00275

[32m[I 2021-09-02 01:29:02,515][0m Trial 5 finished with value: 0.6304044130747363 and parameters: {'embedding_dim': 380, 'num_filters': 451, 'hidden_dim': 138, 'dropout_p': 0.7919175725364053, 'lr': 0.00024408053444791993}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00695, val_loss: 0.00569, lr: 1.15E-04, _patience: 10
Epoch: 2 | train_loss: 0.00526, val_loss: 0.00408, lr: 1.15E-04, _patience: 10
Epoch: 3 | train_loss: 0.00455, val_loss: 0.00389, lr: 1.15E-04, _patience: 10
Epoch: 4 | train_loss: 0.00415, val_loss: 0.00374, lr: 1.15E-04, _patience: 10
Epoch: 5 | train_loss: 0.00384, val_loss: 0.00353, lr: 1.15E-04, _patience: 10
Epoch: 6 | train_loss: 0.00353, val_loss: 0.00338, lr: 1.15E-04, _patience: 10
Epoch: 7 | train_loss: 0.00328, val_loss: 0.00318, lr: 1.15E-04, _patience: 10
Epoch: 8 | train_loss: 0.00304, val_loss: 0.00303, lr: 1.15E-04, _patience: 10
Epoch: 9 | train_loss: 0.00279, val_loss: 0.00289, lr: 1.15E-04, _patience: 10
Epoch: 10 | train_loss: 0.00265, val_loss: 0.00275, lr: 1.15E-04, _patience: 10
Epoch: 11 | train_loss: 0.00247, val_loss: 0.00264, lr: 1.15E-04, _patience: 10
Epoch: 12 | train_loss: 0.00230, val_loss: 0.00256, lr: 1.15E-04, _patience: 10
Epoch: 13 | train_loss: 0.00222, val_loss: 0.0025

[32m[I 2021-09-02 01:31:12,915][0m Trial 6 finished with value: 0.6760696455945018 and parameters: {'embedding_dim': 278, 'num_filters': 442, 'hidden_dim': 239, 'dropout_p': 0.49059175382969394, 'lr': 0.0001147282071154538}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00647, val_loss: 0.00578, lr: 1.13E-04, _patience: 10
Epoch: 2 | train_loss: 0.00477, val_loss: 0.00413, lr: 1.13E-04, _patience: 10
Epoch: 3 | train_loss: 0.00420, val_loss: 0.00403, lr: 1.13E-04, _patience: 10
Epoch: 4 | train_loss: 0.00406, val_loss: 0.00396, lr: 1.13E-04, _patience: 10
Epoch: 5 | train_loss: 0.00386, val_loss: 0.00381, lr: 1.13E-04, _patience: 10
Epoch: 6 | train_loss: 0.00367, val_loss: 0.00366, lr: 1.13E-04, _patience: 10
Epoch: 7 | train_loss: 0.00341, val_loss: 0.00349, lr: 1.13E-04, _patience: 10
Epoch: 8 | train_loss: 0.00324, val_loss: 0.00329, lr: 1.13E-04, _patience: 10
Epoch: 9 | train_loss: 0.00303, val_loss: 0.00314, lr: 1.13E-04, _patience: 10
Epoch: 10 | train_loss: 0.00283, val_loss: 0.00297, lr: 1.13E-04, _patience: 10
Epoch: 11 | train_loss: 0.00265, val_loss: 0.00287, lr: 1.13E-04, _patience: 10
Epoch: 12 | train_loss: 0.00250, val_loss: 0.00276, lr: 1.13E-04, _patience: 10
Epoch: 13 | train_loss: 0.00234, val_loss: 0.0026

[32m[I 2021-09-02 01:32:41,009][0m Trial 7 finished with value: 0.6812641884892998 and parameters: {'embedding_dim': 143, 'num_filters': 435, 'hidden_dim': 402, 'dropout_p': 0.4249064577125997, 'lr': 0.00011307774367830222}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.01147, val_loss: 0.00466, lr: 3.23E-04, _patience: 10
Epoch: 2 | train_loss: 0.00788, val_loss: 0.00420, lr: 3.23E-04, _patience: 10
Epoch: 3 | train_loss: 0.00636, val_loss: 0.00402, lr: 3.23E-04, _patience: 10
Epoch: 4 | train_loss: 0.00559, val_loss: 0.00388, lr: 3.23E-04, _patience: 10
Epoch: 5 | train_loss: 0.00513, val_loss: 0.00375, lr: 3.23E-04, _patience: 10
Epoch: 6 | train_loss: 0.00479, val_loss: 0.00361, lr: 3.23E-04, _patience: 10
Epoch: 7 | train_loss: 0.00443, val_loss: 0.00350, lr: 3.23E-04, _patience: 10
Epoch: 8 | train_loss: 0.00429, val_loss: 0.00336, lr: 3.23E-04, _patience: 10
Epoch: 9 | train_loss: 0.00406, val_loss: 0.00323, lr: 3.23E-04, _patience: 10
Epoch: 10 | train_loss: 0.00394, val_loss: 0.00313, lr: 3.23E-04, _patience: 10
Epoch: 11 | train_loss: 0.00365, val_loss: 0.00323, lr: 3.23E-04, _patience: 9
Epoch: 12 | train_loss: 0.00365, val_loss: 0.00295, lr: 3.23E-04, _patience: 10
Epoch: 13 | train_loss: 0.00344, val_loss: 0.00293

[32m[I 2021-09-02 01:33:39,275][0m Trial 8 finished with value: 0.6037287009247577 and parameters: {'embedding_dim': 264, 'num_filters': 167, 'hidden_dim': 129, 'dropout_p': 0.7925746802485985, 'lr': 0.0003234720460926899}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00843, val_loss: 0.00463, lr: 7.90E-05, _patience: 10
Epoch: 2 | train_loss: 0.00545, val_loss: 0.00476, lr: 7.90E-05, _patience: 9
Epoch: 3 | train_loss: 0.00498, val_loss: 0.00427, lr: 7.90E-05, _patience: 10
Epoch: 4 | train_loss: 0.00472, val_loss: 0.00415, lr: 7.90E-05, _patience: 10
Epoch: 5 | train_loss: 0.00462, val_loss: 0.00414, lr: 7.90E-05, _patience: 10
Epoch: 6 | train_loss: 0.00448, val_loss: 0.00409, lr: 7.90E-05, _patience: 10
Epoch: 7 | train_loss: 0.00435, val_loss: 0.00405, lr: 7.90E-05, _patience: 10
Epoch: 8 | train_loss: 0.00423, val_loss: 0.00400, lr: 7.90E-05, _patience: 10
Epoch: 9 | train_loss: 0.00416, val_loss: 0.00396, lr: 7.90E-05, _patience: 10
Epoch: 10 | train_loss: 0.00407, val_loss: 0.00390, lr: 7.90E-05, _patience: 10
Epoch: 11 | train_loss: 0.00394, val_loss: 0.00386, lr: 7.90E-05, _patience: 10
Epoch: 12 | train_loss: 0.00384, val_loss: 0.00380, lr: 7.90E-05, _patience: 10
Epoch: 13 | train_loss: 0.00371, val_loss: 0.00374

[32m[I 2021-09-02 01:34:57,306][0m Trial 9 finished with value: 0.6577800506578666 and parameters: {'embedding_dim': 161, 'num_filters': 213, 'hidden_dim': 394, 'dropout_p': 0.6869216587736107, 'lr': 7.904835653843723e-05}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00936, val_loss: 0.00552, lr: 3.96E-04, _patience: 10
Epoch: 2 | train_loss: 0.00510, val_loss: 0.00425, lr: 3.96E-04, _patience: 10
Epoch: 3 | train_loss: 0.00382, val_loss: 0.00349, lr: 3.96E-04, _patience: 10
Epoch: 4 | train_loss: 0.00305, val_loss: 0.00295, lr: 3.96E-04, _patience: 10
Epoch: 5 | train_loss: 0.00250, val_loss: 0.00263, lr: 3.96E-04, _patience: 10
Epoch: 6 | train_loss: 0.00208, val_loss: 0.00247, lr: 3.96E-04, _patience: 10
Epoch: 7 | train_loss: 0.00173, val_loss: 0.00239, lr: 3.96E-04, _patience: 10
Epoch: 8 | train_loss: 0.00144, val_loss: 0.00227, lr: 3.96E-04, _patience: 10
Epoch: 9 | train_loss: 0.00117, val_loss: 0.00221, lr: 3.96E-04, _patience: 10
Epoch: 10 | train_loss: 0.00096, val_loss: 0.00226, lr: 3.96E-04, _patience: 9
Epoch: 11 | train_loss: 0.00080, val_loss: 0.00223, lr: 3.96E-04, _patience: 8
Epoch: 12 | train_loss: 0.00066, val_loss: 0.00228, lr: 3.96E-04, _patience: 7
Epoch: 13 | train_loss: 0.00054, val_loss: 0.00239, 

[32m[I 2021-09-02 01:35:53,502][0m Trial 10 finished with value: 0.6789307249717715 and parameters: {'embedding_dim': 196, 'num_filters': 508, 'hidden_dim': 482, 'dropout_p': 0.3458682136775321, 'lr': 0.00039586259922156513}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00673, val_loss: 0.00571, lr: 1.59E-04, _patience: 10
Epoch: 2 | train_loss: 0.00490, val_loss: 0.00417, lr: 1.59E-04, _patience: 10
Epoch: 3 | train_loss: 0.00419, val_loss: 0.00406, lr: 1.59E-04, _patience: 10
Epoch: 4 | train_loss: 0.00390, val_loss: 0.00387, lr: 1.59E-04, _patience: 10
Epoch: 5 | train_loss: 0.00368, val_loss: 0.00374, lr: 1.59E-04, _patience: 10
Epoch: 6 | train_loss: 0.00345, val_loss: 0.00349, lr: 1.59E-04, _patience: 10
Epoch: 7 | train_loss: 0.00319, val_loss: 0.00331, lr: 1.59E-04, _patience: 10
Epoch: 8 | train_loss: 0.00291, val_loss: 0.00308, lr: 1.59E-04, _patience: 10
Epoch: 9 | train_loss: 0.00265, val_loss: 0.00290, lr: 1.59E-04, _patience: 10
Epoch: 10 | train_loss: 0.00243, val_loss: 0.00274, lr: 1.59E-04, _patience: 10
Epoch: 11 | train_loss: 0.00222, val_loss: 0.00262, lr: 1.59E-04, _patience: 10
Epoch: 12 | train_loss: 0.00207, val_loss: 0.00252, lr: 1.59E-04, _patience: 10
Epoch: 13 | train_loss: 0.00194, val_loss: 0.0024

[32m[I 2021-09-02 01:37:04,981][0m Trial 11 finished with value: 0.6694897828389194 and parameters: {'embedding_dim': 129, 'num_filters': 388, 'hidden_dim': 511, 'dropout_p': 0.42575842372533995, 'lr': 0.00015922214044810807}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00752, val_loss: 0.00571, lr: 2.21E-04, _patience: 10
Epoch: 2 | train_loss: 0.00488, val_loss: 0.00409, lr: 2.21E-04, _patience: 10
Epoch: 3 | train_loss: 0.00403, val_loss: 0.00389, lr: 2.21E-04, _patience: 10
Epoch: 4 | train_loss: 0.00367, val_loss: 0.00362, lr: 2.21E-04, _patience: 10
Epoch: 5 | train_loss: 0.00330, val_loss: 0.00332, lr: 2.21E-04, _patience: 10
Epoch: 6 | train_loss: 0.00292, val_loss: 0.00305, lr: 2.21E-04, _patience: 10
Epoch: 7 | train_loss: 0.00262, val_loss: 0.00282, lr: 2.21E-04, _patience: 10
Epoch: 8 | train_loss: 0.00232, val_loss: 0.00265, lr: 2.21E-04, _patience: 10
Epoch: 9 | train_loss: 0.00205, val_loss: 0.00252, lr: 2.21E-04, _patience: 10
Epoch: 10 | train_loss: 0.00187, val_loss: 0.00248, lr: 2.21E-04, _patience: 10
Epoch: 11 | train_loss: 0.00170, val_loss: 0.00247, lr: 2.21E-04, _patience: 10
Epoch: 12 | train_loss: 0.00150, val_loss: 0.00234, lr: 2.21E-04, _patience: 10
Epoch: 13 | train_loss: 0.00133, val_loss: 0.0022

[32m[I 2021-09-02 01:38:08,336][0m Trial 12 finished with value: 0.6740707036488466 and parameters: {'embedding_dim': 128, 'num_filters': 512, 'hidden_dim': 421, 'dropout_p': 0.3450977488645123, 'lr': 0.00022095061201785128}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00648, val_loss: 0.00580, lr: 1.07E-04, _patience: 10
Epoch: 2 | train_loss: 0.00474, val_loss: 0.00401, lr: 1.07E-04, _patience: 10
Epoch: 3 | train_loss: 0.00411, val_loss: 0.00394, lr: 1.07E-04, _patience: 10
Epoch: 4 | train_loss: 0.00387, val_loss: 0.00377, lr: 1.07E-04, _patience: 10
Epoch: 5 | train_loss: 0.00359, val_loss: 0.00358, lr: 1.07E-04, _patience: 10
Epoch: 6 | train_loss: 0.00337, val_loss: 0.00337, lr: 1.07E-04, _patience: 10
Epoch: 7 | train_loss: 0.00315, val_loss: 0.00315, lr: 1.07E-04, _patience: 10
Epoch: 8 | train_loss: 0.00288, val_loss: 0.00299, lr: 1.07E-04, _patience: 10
Epoch: 9 | train_loss: 0.00268, val_loss: 0.00283, lr: 1.07E-04, _patience: 10
Epoch: 10 | train_loss: 0.00249, val_loss: 0.00272, lr: 1.07E-04, _patience: 10
Epoch: 11 | train_loss: 0.00235, val_loss: 0.00261, lr: 1.07E-04, _patience: 10
Epoch: 12 | train_loss: 0.00214, val_loss: 0.00252, lr: 1.07E-04, _patience: 10
Epoch: 13 | train_loss: 0.00200, val_loss: 0.0024

[32m[I 2021-09-02 01:39:41,049][0m Trial 13 finished with value: 0.6700895444744346 and parameters: {'embedding_dim': 219, 'num_filters': 449, 'hidden_dim': 411, 'dropout_p': 0.4422439623256509, 'lr': 0.00010682946560523402}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00664, val_loss: 0.00554, lr: 1.23E-04, _patience: 10
Epoch: 2 | train_loss: 0.00452, val_loss: 0.00393, lr: 1.23E-04, _patience: 10
Epoch: 3 | train_loss: 0.00391, val_loss: 0.00369, lr: 1.23E-04, _patience: 10
Epoch: 4 | train_loss: 0.00338, val_loss: 0.00333, lr: 1.23E-04, _patience: 10
Epoch: 5 | train_loss: 0.00304, val_loss: 0.00304, lr: 1.23E-04, _patience: 10
Epoch: 6 | train_loss: 0.00268, val_loss: 0.00283, lr: 1.23E-04, _patience: 10
Epoch: 7 | train_loss: 0.00241, val_loss: 0.00264, lr: 1.23E-04, _patience: 10
Epoch: 8 | train_loss: 0.00219, val_loss: 0.00258, lr: 1.23E-04, _patience: 10
Epoch: 9 | train_loss: 0.00198, val_loss: 0.00245, lr: 1.23E-04, _patience: 10
Epoch: 10 | train_loss: 0.00181, val_loss: 0.00242, lr: 1.23E-04, _patience: 10
Epoch: 11 | train_loss: 0.00164, val_loss: 0.00240, lr: 1.23E-04, _patience: 10
Epoch: 12 | train_loss: 0.00150, val_loss: 0.00229, lr: 1.23E-04, _patience: 10
Epoch: 13 | train_loss: 0.00132, val_loss: 0.0022

[32m[I 2021-09-02 01:41:24,721][0m Trial 14 finished with value: 0.6695801319370777 and parameters: {'embedding_dim': 497, 'num_filters': 396, 'hidden_dim': 458, 'dropout_p': 0.4748623756212068, 'lr': 0.0001234013454462735}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00696, val_loss: 0.00472, lr: 5.95E-05, _patience: 10
Epoch: 2 | train_loss: 0.00459, val_loss: 0.00462, lr: 5.95E-05, _patience: 10
Epoch: 3 | train_loss: 0.00421, val_loss: 0.00416, lr: 5.95E-05, _patience: 10
Epoch: 4 | train_loss: 0.00402, val_loss: 0.00408, lr: 5.95E-05, _patience: 10
Epoch: 5 | train_loss: 0.00393, val_loss: 0.00403, lr: 5.95E-05, _patience: 10
Epoch: 6 | train_loss: 0.00383, val_loss: 0.00398, lr: 5.95E-05, _patience: 10
Epoch: 7 | train_loss: 0.00370, val_loss: 0.00390, lr: 5.95E-05, _patience: 10
Epoch: 8 | train_loss: 0.00362, val_loss: 0.00383, lr: 5.95E-05, _patience: 10
Epoch: 9 | train_loss: 0.00350, val_loss: 0.00374, lr: 5.95E-05, _patience: 10
Epoch: 10 | train_loss: 0.00338, val_loss: 0.00366, lr: 5.95E-05, _patience: 10
Epoch: 11 | train_loss: 0.00326, val_loss: 0.00356, lr: 5.95E-05, _patience: 10
Epoch: 12 | train_loss: 0.00319, val_loss: 0.00346, lr: 5.95E-05, _patience: 10
Epoch: 13 | train_loss: 0.00306, val_loss: 0.0033

[32m[I 2021-09-02 01:43:35,221][0m Trial 15 finished with value: 0.6493208822681001 and parameters: {'embedding_dim': 169, 'num_filters': 313, 'hidden_dim': 360, 'dropout_p': 0.3045297423130306, 'lr': 5.951900802779886e-05}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00840, val_loss: 0.00581, lr: 2.68E-04, _patience: 10
Epoch: 2 | train_loss: 0.00538, val_loss: 0.00427, lr: 2.68E-04, _patience: 10
Epoch: 3 | train_loss: 0.00417, val_loss: 0.00388, lr: 2.68E-04, _patience: 10
Epoch: 4 | train_loss: 0.00362, val_loss: 0.00356, lr: 2.68E-04, _patience: 10
Epoch: 5 | train_loss: 0.00319, val_loss: 0.00315, lr: 2.68E-04, _patience: 10
Epoch: 6 | train_loss: 0.00276, val_loss: 0.00287, lr: 2.68E-04, _patience: 10
Epoch: 7 | train_loss: 0.00243, val_loss: 0.00269, lr: 2.68E-04, _patience: 10
Epoch: 8 | train_loss: 0.00213, val_loss: 0.00256, lr: 2.68E-04, _patience: 10
Epoch: 9 | train_loss: 0.00191, val_loss: 0.00246, lr: 2.68E-04, _patience: 10
Epoch: 10 | train_loss: 0.00170, val_loss: 0.00239, lr: 2.68E-04, _patience: 10
Epoch: 11 | train_loss: 0.00150, val_loss: 0.00236, lr: 2.68E-04, _patience: 10
Epoch: 12 | train_loss: 0.00131, val_loss: 0.00238, lr: 2.68E-04, _patience: 9
Epoch: 13 | train_loss: 0.00122, val_loss: 0.00234

[32m[I 2021-09-02 01:44:39,778][0m Trial 16 finished with value: 0.6780570770943167 and parameters: {'embedding_dim': 140, 'num_filters': 491, 'hidden_dim': 504, 'dropout_p': 0.39559363158799893, 'lr': 0.00026756160114541246}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.01075, val_loss: 0.00540, lr: 4.81E-04, _patience: 10
Epoch: 2 | train_loss: 0.00541, val_loss: 0.00413, lr: 4.81E-04, _patience: 10
Epoch: 3 | train_loss: 0.00405, val_loss: 0.00356, lr: 4.81E-04, _patience: 10
Epoch: 4 | train_loss: 0.00334, val_loss: 0.00298, lr: 4.81E-04, _patience: 10
Epoch: 5 | train_loss: 0.00273, val_loss: 0.00262, lr: 4.81E-04, _patience: 10
Epoch: 6 | train_loss: 0.00231, val_loss: 0.00246, lr: 4.81E-04, _patience: 10
Epoch: 7 | train_loss: 0.00196, val_loss: 0.00236, lr: 4.81E-04, _patience: 10
Epoch: 8 | train_loss: 0.00163, val_loss: 0.00227, lr: 4.81E-04, _patience: 10
Epoch: 9 | train_loss: 0.00140, val_loss: 0.00228, lr: 4.81E-04, _patience: 9
Epoch: 10 | train_loss: 0.00115, val_loss: 0.00229, lr: 4.81E-04, _patience: 8
Epoch: 11 | train_loss: 0.00094, val_loss: 0.00228, lr: 4.81E-04, _patience: 7
Epoch: 12 | train_loss: 0.00083, val_loss: 0.00228, lr: 4.81E-04, _patience: 6
Epoch: 13 | train_loss: 0.00066, val_loss: 0.00246, l

[32m[I 2021-09-02 01:45:36,733][0m Trial 17 finished with value: 0.6754963510830186 and parameters: {'embedding_dim': 242, 'num_filters': 476, 'hidden_dim': 359, 'dropout_p': 0.5128207090403741, 'lr': 0.00048060762144196015}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00652, val_loss: 0.00512, lr: 5.02E-05, _patience: 10
Epoch: 2 | train_loss: 0.00498, val_loss: 0.00458, lr: 5.02E-05, _patience: 10
Epoch: 3 | train_loss: 0.00444, val_loss: 0.00414, lr: 5.02E-05, _patience: 10
Epoch: 4 | train_loss: 0.00427, val_loss: 0.00410, lr: 5.02E-05, _patience: 10
Epoch: 5 | train_loss: 0.00408, val_loss: 0.00407, lr: 5.02E-05, _patience: 10
Epoch: 6 | train_loss: 0.00405, val_loss: 0.00399, lr: 5.02E-05, _patience: 10
Epoch: 7 | train_loss: 0.00394, val_loss: 0.00392, lr: 5.02E-05, _patience: 10
Epoch: 8 | train_loss: 0.00385, val_loss: 0.00386, lr: 5.02E-05, _patience: 10
Epoch: 9 | train_loss: 0.00372, val_loss: 0.00379, lr: 5.02E-05, _patience: 10
Epoch: 10 | train_loss: 0.00360, val_loss: 0.00370, lr: 5.02E-05, _patience: 10
Epoch: 11 | train_loss: 0.00351, val_loss: 0.00362, lr: 5.02E-05, _patience: 10
Epoch: 12 | train_loss: 0.00339, val_loss: 0.00354, lr: 5.02E-05, _patience: 10
Epoch: 13 | train_loss: 0.00331, val_loss: 0.0034

[32m[I 2021-09-02 01:48:49,348][0m Trial 18 finished with value: 0.6565349082576424 and parameters: {'embedding_dim': 183, 'num_filters': 399, 'hidden_dim': 463, 'dropout_p': 0.5375301426471648, 'lr': 5.022892907417671e-05}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00654, val_loss: 0.00477, lr: 2.07E-04, _patience: 10
Epoch: 2 | train_loss: 0.00434, val_loss: 0.00391, lr: 2.07E-04, _patience: 10
Epoch: 3 | train_loss: 0.00369, val_loss: 0.00358, lr: 2.07E-04, _patience: 10
Epoch: 4 | train_loss: 0.00321, val_loss: 0.00317, lr: 2.07E-04, _patience: 10
Epoch: 5 | train_loss: 0.00278, val_loss: 0.00289, lr: 2.07E-04, _patience: 10
Epoch: 6 | train_loss: 0.00244, val_loss: 0.00266, lr: 2.07E-04, _patience: 10
Epoch: 7 | train_loss: 0.00212, val_loss: 0.00250, lr: 2.07E-04, _patience: 10
Epoch: 8 | train_loss: 0.00183, val_loss: 0.00241, lr: 2.07E-04, _patience: 10
Epoch: 9 | train_loss: 0.00160, val_loss: 0.00230, lr: 2.07E-04, _patience: 10
Epoch: 10 | train_loss: 0.00142, val_loss: 0.00232, lr: 2.07E-04, _patience: 9
Epoch: 11 | train_loss: 0.00126, val_loss: 0.00223, lr: 2.07E-04, _patience: 10
Epoch: 12 | train_loss: 0.00112, val_loss: 0.00220, lr: 2.07E-04, _patience: 10
Epoch: 13 | train_loss: 0.00101, val_loss: 0.00217

[32m[I 2021-09-02 01:50:01,049][0m Trial 19 finished with value: 0.6749437845946652 and parameters: {'embedding_dim': 323, 'num_filters': 331, 'hidden_dim': 436, 'dropout_p': 0.3756055510213373, 'lr': 0.0002072148717956434}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00671, val_loss: 0.00550, lr: 1.33E-04, _patience: 10
Epoch: 2 | train_loss: 0.00451, val_loss: 0.00380, lr: 1.33E-04, _patience: 10
Epoch: 3 | train_loss: 0.00375, val_loss: 0.00355, lr: 1.33E-04, _patience: 10
Epoch: 4 | train_loss: 0.00331, val_loss: 0.00314, lr: 1.33E-04, _patience: 10
Epoch: 5 | train_loss: 0.00288, val_loss: 0.00290, lr: 1.33E-04, _patience: 10
Epoch: 6 | train_loss: 0.00261, val_loss: 0.00272, lr: 1.33E-04, _patience: 10
Epoch: 7 | train_loss: 0.00228, val_loss: 0.00254, lr: 1.33E-04, _patience: 10
Epoch: 8 | train_loss: 0.00201, val_loss: 0.00241, lr: 1.33E-04, _patience: 10
Epoch: 9 | train_loss: 0.00187, val_loss: 0.00232, lr: 1.33E-04, _patience: 10
Epoch: 10 | train_loss: 0.00165, val_loss: 0.00229, lr: 1.33E-04, _patience: 10
Epoch: 11 | train_loss: 0.00150, val_loss: 0.00225, lr: 1.33E-04, _patience: 10
Epoch: 12 | train_loss: 0.00136, val_loss: 0.00220, lr: 1.33E-04, _patience: 10
Epoch: 13 | train_loss: 0.00121, val_loss: 0.0021

[32m[I 2021-09-02 01:51:40,264][0m Trial 20 finished with value: 0.6734628252689255 and parameters: {'embedding_dim': 493, 'num_filters': 426, 'hidden_dim': 365, 'dropout_p': 0.4531425355825031, 'lr': 0.00013343159644667093}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00968, val_loss: 0.00491, lr: 4.96E-04, _patience: 10
Epoch: 2 | train_loss: 0.00494, val_loss: 0.00415, lr: 4.96E-04, _patience: 10
Epoch: 3 | train_loss: 0.00368, val_loss: 0.00343, lr: 4.96E-04, _patience: 10
Epoch: 4 | train_loss: 0.00291, val_loss: 0.00287, lr: 4.96E-04, _patience: 10
Epoch: 5 | train_loss: 0.00231, val_loss: 0.00255, lr: 4.96E-04, _patience: 10
Epoch: 6 | train_loss: 0.00183, val_loss: 0.00239, lr: 4.96E-04, _patience: 10
Epoch: 7 | train_loss: 0.00153, val_loss: 0.00231, lr: 4.96E-04, _patience: 10
Epoch: 8 | train_loss: 0.00121, val_loss: 0.00230, lr: 4.96E-04, _patience: 10
Epoch: 9 | train_loss: 0.00098, val_loss: 0.00231, lr: 4.96E-04, _patience: 9
Epoch: 10 | train_loss: 0.00077, val_loss: 0.00238, lr: 4.96E-04, _patience: 8
Epoch: 11 | train_loss: 0.00060, val_loss: 0.00247, lr: 4.96E-04, _patience: 7
Epoch: 12 | train_loss: 0.00051, val_loss: 0.00257, lr: 4.96E-04, _patience: 6
Epoch: 13 | train_loss: 0.00045, val_loss: 0.00263, l

[32m[I 2021-09-02 01:52:32,747][0m Trial 21 finished with value: 0.6838145202242131 and parameters: {'embedding_dim': 192, 'num_filters': 506, 'hidden_dim': 486, 'dropout_p': 0.3103010505203235, 'lr': 0.0004959529149232021}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.01039, val_loss: 0.00621, lr: 4.98E-04, _patience: 10
Epoch: 2 | train_loss: 0.00533, val_loss: 0.00423, lr: 4.98E-04, _patience: 10
Epoch: 3 | train_loss: 0.00383, val_loss: 0.00360, lr: 4.98E-04, _patience: 10
Epoch: 4 | train_loss: 0.00306, val_loss: 0.00298, lr: 4.98E-04, _patience: 10
Epoch: 5 | train_loss: 0.00245, val_loss: 0.00263, lr: 4.98E-04, _patience: 10
Epoch: 6 | train_loss: 0.00200, val_loss: 0.00248, lr: 4.98E-04, _patience: 10
Epoch: 7 | train_loss: 0.00164, val_loss: 0.00234, lr: 4.98E-04, _patience: 10
Epoch: 8 | train_loss: 0.00131, val_loss: 0.00232, lr: 4.98E-04, _patience: 10
Epoch: 9 | train_loss: 0.00108, val_loss: 0.00233, lr: 4.98E-04, _patience: 9
Epoch: 10 | train_loss: 0.00088, val_loss: 0.00250, lr: 4.98E-04, _patience: 8
Epoch: 11 | train_loss: 0.00071, val_loss: 0.00250, lr: 4.98E-04, _patience: 7
Epoch: 12 | train_loss: 0.00063, val_loss: 0.00252, lr: 4.98E-04, _patience: 6
Epoch: 13 | train_loss: 0.00051, val_loss: 0.00258, l

[32m[I 2021-09-02 01:53:21,956][0m Trial 22 finished with value: 0.6731457425105557 and parameters: {'embedding_dim': 161, 'num_filters': 507, 'hidden_dim': 506, 'dropout_p': 0.3193223079661554, 'lr': 0.0004979294994865425}. Best is trial 1 with value: 0.6878502065377774.[0m


Epoch: 1 | train_loss: 0.00894, val_loss: 0.00607, lr: 3.18E-04, _patience: 10
Epoch: 2 | train_loss: 0.00515, val_loss: 0.00408, lr: 3.18E-04, _patience: 10
Epoch: 3 | train_loss: 0.00383, val_loss: 0.00363, lr: 3.18E-04, _patience: 10
Epoch: 4 | train_loss: 0.00319, val_loss: 0.00313, lr: 3.18E-04, _patience: 10
Epoch: 5 | train_loss: 0.00265, val_loss: 0.00277, lr: 3.18E-04, _patience: 10
Epoch: 6 | train_loss: 0.00225, val_loss: 0.00262, lr: 3.18E-04, _patience: 10
Epoch: 7 | train_loss: 0.00191, val_loss: 0.00246, lr: 3.18E-04, _patience: 10
Epoch: 8 | train_loss: 0.00164, val_loss: 0.00240, lr: 3.18E-04, _patience: 10
Epoch: 9 | train_loss: 0.00135, val_loss: 0.00228, lr: 3.18E-04, _patience: 10
Epoch: 10 | train_loss: 0.00112, val_loss: 0.00227, lr: 3.18E-04, _patience: 10
Epoch: 11 | train_loss: 0.00097, val_loss: 0.00242, lr: 3.18E-04, _patience: 9
Epoch: 12 | train_loss: 0.00087, val_loss: 0.00235, lr: 3.18E-04, _patience: 8
Epoch: 13 | train_loss: 0.00073, val_loss: 0.00241,

[32m[I 2021-09-02 01:54:24,108][0m Trial 23 finished with value: 0.6893605235149695 and parameters: {'embedding_dim': 231, 'num_filters': 476, 'hidden_dim': 471, 'dropout_p': 0.39253220838548886, 'lr': 0.0003177277068343745}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00840, val_loss: 0.00515, lr: 3.22E-04, _patience: 10
Epoch: 2 | train_loss: 0.00506, val_loss: 0.00400, lr: 3.22E-04, _patience: 10
Epoch: 3 | train_loss: 0.00384, val_loss: 0.00353, lr: 3.22E-04, _patience: 10
Epoch: 4 | train_loss: 0.00312, val_loss: 0.00299, lr: 3.22E-04, _patience: 10
Epoch: 5 | train_loss: 0.00255, val_loss: 0.00266, lr: 3.22E-04, _patience: 10
Epoch: 6 | train_loss: 0.00216, val_loss: 0.00251, lr: 3.22E-04, _patience: 10
Epoch: 7 | train_loss: 0.00184, val_loss: 0.00241, lr: 3.22E-04, _patience: 10
Epoch: 8 | train_loss: 0.00156, val_loss: 0.00232, lr: 3.22E-04, _patience: 10
Epoch: 9 | train_loss: 0.00131, val_loss: 0.00225, lr: 3.22E-04, _patience: 10
Epoch: 10 | train_loss: 0.00106, val_loss: 0.00227, lr: 3.22E-04, _patience: 9
Epoch: 11 | train_loss: 0.00094, val_loss: 0.00226, lr: 3.22E-04, _patience: 8
Epoch: 12 | train_loss: 0.00078, val_loss: 0.00226, lr: 3.22E-04, _patience: 7
Epoch: 13 | train_loss: 0.00068, val_loss: 0.00227, 

[32m[I 2021-09-02 01:55:22,988][0m Trial 24 finished with value: 0.6692339705203999 and parameters: {'embedding_dim': 234, 'num_filters': 472, 'hidden_dim': 482, 'dropout_p': 0.39636368855592796, 'lr': 0.0003223403889074547}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00827, val_loss: 0.00521, lr: 3.62E-04, _patience: 10
Epoch: 2 | train_loss: 0.00477, val_loss: 0.00409, lr: 3.62E-04, _patience: 10
Epoch: 3 | train_loss: 0.00365, val_loss: 0.00343, lr: 3.62E-04, _patience: 10
Epoch: 4 | train_loss: 0.00290, val_loss: 0.00293, lr: 3.62E-04, _patience: 10
Epoch: 5 | train_loss: 0.00233, val_loss: 0.00260, lr: 3.62E-04, _patience: 10
Epoch: 6 | train_loss: 0.00190, val_loss: 0.00242, lr: 3.62E-04, _patience: 10
Epoch: 7 | train_loss: 0.00156, val_loss: 0.00237, lr: 3.62E-04, _patience: 10
Epoch: 8 | train_loss: 0.00131, val_loss: 0.00235, lr: 3.62E-04, _patience: 10
Epoch: 9 | train_loss: 0.00111, val_loss: 0.00237, lr: 3.62E-04, _patience: 9
Epoch: 10 | train_loss: 0.00094, val_loss: 0.00238, lr: 3.62E-04, _patience: 8
Epoch: 11 | train_loss: 0.00079, val_loss: 0.00238, lr: 3.62E-04, _patience: 7
Epoch: 12 | train_loss: 0.00071, val_loss: 0.00259, lr: 3.62E-04, _patience: 6
Epoch: 13 | train_loss: 0.00063, val_loss: 0.00266, l

[32m[I 2021-09-02 01:56:15,218][0m Trial 25 finished with value: 0.6728039551963331 and parameters: {'embedding_dim': 199, 'num_filters': 477, 'hidden_dim': 450, 'dropout_p': 0.30369084382162465, 'lr': 0.0003618977945722521}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00899, val_loss: 0.00507, lr: 4.25E-04, _patience: 10
Epoch: 2 | train_loss: 0.00473, val_loss: 0.00393, lr: 4.25E-04, _patience: 10
Epoch: 3 | train_loss: 0.00345, val_loss: 0.00320, lr: 4.25E-04, _patience: 10
Epoch: 4 | train_loss: 0.00277, val_loss: 0.00271, lr: 4.25E-04, _patience: 10
Epoch: 5 | train_loss: 0.00223, val_loss: 0.00247, lr: 4.25E-04, _patience: 10
Epoch: 6 | train_loss: 0.00178, val_loss: 0.00228, lr: 4.25E-04, _patience: 10
Epoch: 7 | train_loss: 0.00145, val_loss: 0.00224, lr: 4.25E-04, _patience: 10
Epoch: 8 | train_loss: 0.00113, val_loss: 0.00226, lr: 4.25E-04, _patience: 9
Epoch: 9 | train_loss: 0.00093, val_loss: 0.00234, lr: 4.25E-04, _patience: 8
Epoch: 10 | train_loss: 0.00079, val_loss: 0.00245, lr: 4.25E-04, _patience: 7
Epoch: 11 | train_loss: 0.00065, val_loss: 0.00271, lr: 4.25E-04, _patience: 6
Epoch: 12 | train_loss: 0.00062, val_loss: 0.00277, lr: 4.25E-04, _patience: 5
Epoch: 13 | train_loss: 0.00050, val_loss: 0.00275, lr

[32m[I 2021-09-02 01:57:10,126][0m Trial 26 finished with value: 0.687146310954673 and parameters: {'embedding_dim': 240, 'num_filters': 504, 'hidden_dim': 485, 'dropout_p': 0.3649819780181304, 'lr': 0.00042539947611324364}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00858, val_loss: 0.00492, lr: 4.16E-04, _patience: 10
Epoch: 2 | train_loss: 0.00480, val_loss: 0.00406, lr: 4.16E-04, _patience: 10
Epoch: 3 | train_loss: 0.00361, val_loss: 0.00331, lr: 4.16E-04, _patience: 10
Epoch: 4 | train_loss: 0.00289, val_loss: 0.00275, lr: 4.16E-04, _patience: 10
Epoch: 5 | train_loss: 0.00232, val_loss: 0.00248, lr: 4.16E-04, _patience: 10
Epoch: 6 | train_loss: 0.00192, val_loss: 0.00235, lr: 4.16E-04, _patience: 10
Epoch: 7 | train_loss: 0.00162, val_loss: 0.00233, lr: 4.16E-04, _patience: 10
Epoch: 8 | train_loss: 0.00130, val_loss: 0.00222, lr: 4.16E-04, _patience: 10
Epoch: 9 | train_loss: 0.00106, val_loss: 0.00220, lr: 4.16E-04, _patience: 10
Epoch: 10 | train_loss: 0.00090, val_loss: 0.00231, lr: 4.16E-04, _patience: 9
Epoch: 11 | train_loss: 0.00073, val_loss: 0.00244, lr: 4.16E-04, _patience: 8
Epoch: 12 | train_loss: 0.00062, val_loss: 0.00255, lr: 4.16E-04, _patience: 7
Epoch: 13 | train_loss: 0.00055, val_loss: 0.00259, 

[32m[I 2021-09-02 01:58:06,820][0m Trial 27 finished with value: 0.6854237586527557 and parameters: {'embedding_dim': 246, 'num_filters': 417, 'hidden_dim': 471, 'dropout_p': 0.3668527628702528, 'lr': 0.0004156393597152231}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00763, val_loss: 0.00511, lr: 2.83E-04, _patience: 10
Epoch: 2 | train_loss: 0.00477, val_loss: 0.00399, lr: 2.83E-04, _patience: 10
Epoch: 3 | train_loss: 0.00383, val_loss: 0.00353, lr: 2.83E-04, _patience: 10
Epoch: 4 | train_loss: 0.00317, val_loss: 0.00304, lr: 2.83E-04, _patience: 10
Epoch: 5 | train_loss: 0.00267, val_loss: 0.00270, lr: 2.83E-04, _patience: 10
Epoch: 6 | train_loss: 0.00229, val_loss: 0.00250, lr: 2.83E-04, _patience: 10
Epoch: 7 | train_loss: 0.00197, val_loss: 0.00238, lr: 2.83E-04, _patience: 10
Epoch: 8 | train_loss: 0.00169, val_loss: 0.00225, lr: 2.83E-04, _patience: 10
Epoch: 9 | train_loss: 0.00145, val_loss: 0.00221, lr: 2.83E-04, _patience: 10
Epoch: 10 | train_loss: 0.00129, val_loss: 0.00226, lr: 2.83E-04, _patience: 9
Epoch: 11 | train_loss: 0.00112, val_loss: 0.00220, lr: 2.83E-04, _patience: 10
Epoch: 12 | train_loss: 0.00095, val_loss: 0.00225, lr: 2.83E-04, _patience: 9
Epoch: 13 | train_loss: 0.00082, val_loss: 0.00226,

[32m[I 2021-09-02 01:59:10,974][0m Trial 28 finished with value: 0.6804190420257422 and parameters: {'embedding_dim': 294, 'num_filters': 368, 'hidden_dim': 511, 'dropout_p': 0.4803885155217381, 'lr': 0.0002830585969387563}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00870, val_loss: 0.00562, lr: 3.05E-04, _patience: 10
Epoch: 2 | train_loss: 0.00556, val_loss: 0.00402, lr: 3.05E-04, _patience: 10
Epoch: 3 | train_loss: 0.00446, val_loss: 0.00374, lr: 3.05E-04, _patience: 10
Epoch: 4 | train_loss: 0.00383, val_loss: 0.00334, lr: 3.05E-04, _patience: 10
Epoch: 5 | train_loss: 0.00333, val_loss: 0.00299, lr: 3.05E-04, _patience: 10
Epoch: 6 | train_loss: 0.00294, val_loss: 0.00276, lr: 3.05E-04, _patience: 10
Epoch: 7 | train_loss: 0.00263, val_loss: 0.00257, lr: 3.05E-04, _patience: 10
Epoch: 8 | train_loss: 0.00232, val_loss: 0.00249, lr: 3.05E-04, _patience: 10
Epoch: 9 | train_loss: 0.00205, val_loss: 0.00238, lr: 3.05E-04, _patience: 10
Epoch: 10 | train_loss: 0.00189, val_loss: 0.00236, lr: 3.05E-04, _patience: 10
Epoch: 11 | train_loss: 0.00169, val_loss: 0.00231, lr: 3.05E-04, _patience: 10
Epoch: 12 | train_loss: 0.00150, val_loss: 0.00229, lr: 3.05E-04, _patience: 10
Epoch: 13 | train_loss: 0.00139, val_loss: 0.0021

[32m[I 2021-09-02 02:00:35,786][0m Trial 29 finished with value: 0.6792903912236194 and parameters: {'embedding_dim': 229, 'num_filters': 466, 'hidden_dim': 230, 'dropout_p': 0.5807220854326729, 'lr': 0.0003053725835980111}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00632, val_loss: 0.00509, lr: 1.83E-04, _patience: 10
Epoch: 2 | train_loss: 0.00434, val_loss: 0.00396, lr: 1.83E-04, _patience: 10
Epoch: 3 | train_loss: 0.00383, val_loss: 0.00373, lr: 1.83E-04, _patience: 10
Epoch: 4 | train_loss: 0.00335, val_loss: 0.00333, lr: 1.83E-04, _patience: 10
Epoch: 5 | train_loss: 0.00295, val_loss: 0.00302, lr: 1.83E-04, _patience: 10
Epoch: 6 | train_loss: 0.00259, val_loss: 0.00281, lr: 1.83E-04, _patience: 10
Epoch: 7 | train_loss: 0.00226, val_loss: 0.00267, lr: 1.83E-04, _patience: 10
Epoch: 8 | train_loss: 0.00207, val_loss: 0.00255, lr: 1.83E-04, _patience: 10
Epoch: 9 | train_loss: 0.00179, val_loss: 0.00248, lr: 1.83E-04, _patience: 10
Epoch: 10 | train_loss: 0.00163, val_loss: 0.00244, lr: 1.83E-04, _patience: 10
Epoch: 11 | train_loss: 0.00145, val_loss: 0.00240, lr: 1.83E-04, _patience: 10
Epoch: 12 | train_loss: 0.00128, val_loss: 0.00239, lr: 1.83E-04, _patience: 10
Epoch: 13 | train_loss: 0.00114, val_loss: 0.0023

[32m[I 2021-09-02 02:01:49,360][0m Trial 30 finished with value: 0.667001578430888 and parameters: {'embedding_dim': 361, 'num_filters': 262, 'hidden_dim': 437, 'dropout_p': 0.41034286882129384, 'lr': 0.00018292032112495122}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00863, val_loss: 0.00508, lr: 4.12E-04, _patience: 10
Epoch: 2 | train_loss: 0.00479, val_loss: 0.00408, lr: 4.12E-04, _patience: 10
Epoch: 3 | train_loss: 0.00364, val_loss: 0.00337, lr: 4.12E-04, _patience: 10
Epoch: 4 | train_loss: 0.00289, val_loss: 0.00283, lr: 4.12E-04, _patience: 10
Epoch: 5 | train_loss: 0.00233, val_loss: 0.00261, lr: 4.12E-04, _patience: 10
Epoch: 6 | train_loss: 0.00191, val_loss: 0.00243, lr: 4.12E-04, _patience: 10
Epoch: 7 | train_loss: 0.00154, val_loss: 0.00235, lr: 4.12E-04, _patience: 10
Epoch: 8 | train_loss: 0.00129, val_loss: 0.00231, lr: 4.12E-04, _patience: 10
Epoch: 9 | train_loss: 0.00105, val_loss: 0.00240, lr: 4.12E-04, _patience: 9
Epoch: 10 | train_loss: 0.00089, val_loss: 0.00244, lr: 4.12E-04, _patience: 8
Epoch: 11 | train_loss: 0.00074, val_loss: 0.00256, lr: 4.12E-04, _patience: 7
Epoch: 12 | train_loss: 0.00062, val_loss: 0.00265, lr: 4.12E-04, _patience: 6
Epoch: 13 | train_loss: 0.00054, val_loss: 0.00267, l

[32m[I 2021-09-02 02:02:43,113][0m Trial 31 finished with value: 0.6679073636059011 and parameters: {'embedding_dim': 246, 'num_filters': 415, 'hidden_dim': 475, 'dropout_p': 0.3630162670340257, 'lr': 0.00041166830654573163}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00897, val_loss: 0.00531, lr: 4.50E-04, _patience: 10
Epoch: 2 | train_loss: 0.00508, val_loss: 0.00384, lr: 4.50E-04, _patience: 10
Epoch: 3 | train_loss: 0.00385, val_loss: 0.00327, lr: 4.50E-04, _patience: 10
Epoch: 4 | train_loss: 0.00311, val_loss: 0.00286, lr: 4.50E-04, _patience: 10
Epoch: 5 | train_loss: 0.00256, val_loss: 0.00259, lr: 4.50E-04, _patience: 10
Epoch: 6 | train_loss: 0.00205, val_loss: 0.00252, lr: 4.50E-04, _patience: 10
Epoch: 7 | train_loss: 0.00181, val_loss: 0.00235, lr: 4.50E-04, _patience: 10
Epoch: 8 | train_loss: 0.00147, val_loss: 0.00235, lr: 4.50E-04, _patience: 10
Epoch: 9 | train_loss: 0.00121, val_loss: 0.00238, lr: 4.50E-04, _patience: 9
Epoch: 10 | train_loss: 0.00108, val_loss: 0.00246, lr: 4.50E-04, _patience: 8
Epoch: 11 | train_loss: 0.00089, val_loss: 0.00239, lr: 4.50E-04, _patience: 7
Epoch: 12 | train_loss: 0.00083, val_loss: 0.00237, lr: 4.50E-04, _patience: 6
Epoch: 13 | train_loss: 0.00074, val_loss: 0.00238, l

[32m[I 2021-09-02 02:03:44,765][0m Trial 32 finished with value: 0.6833612795071888 and parameters: {'embedding_dim': 262, 'num_filters': 490, 'hidden_dim': 176, 'dropout_p': 0.3716395513513812, 'lr': 0.0004504747122905832}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00868, val_loss: 0.00518, lr: 3.66E-04, _patience: 10
Epoch: 2 | train_loss: 0.00479, val_loss: 0.00406, lr: 3.66E-04, _patience: 10
Epoch: 3 | train_loss: 0.00369, val_loss: 0.00340, lr: 3.66E-04, _patience: 10
Epoch: 4 | train_loss: 0.00299, val_loss: 0.00287, lr: 3.66E-04, _patience: 10
Epoch: 5 | train_loss: 0.00239, val_loss: 0.00258, lr: 3.66E-04, _patience: 10
Epoch: 6 | train_loss: 0.00199, val_loss: 0.00243, lr: 3.66E-04, _patience: 10
Epoch: 7 | train_loss: 0.00167, val_loss: 0.00234, lr: 3.66E-04, _patience: 10
Epoch: 8 | train_loss: 0.00137, val_loss: 0.00230, lr: 3.66E-04, _patience: 10
Epoch: 9 | train_loss: 0.00112, val_loss: 0.00229, lr: 3.66E-04, _patience: 10
Epoch: 10 | train_loss: 0.00094, val_loss: 0.00236, lr: 3.66E-04, _patience: 9
Epoch: 11 | train_loss: 0.00078, val_loss: 0.00242, lr: 3.66E-04, _patience: 8
Epoch: 12 | train_loss: 0.00065, val_loss: 0.00248, lr: 3.66E-04, _patience: 7
Epoch: 13 | train_loss: 0.00059, val_loss: 0.00256, 

[32m[I 2021-09-02 02:04:39,192][0m Trial 33 finished with value: 0.6667343823883983 and parameters: {'embedding_dim': 208, 'num_filters': 460, 'hidden_dim': 495, 'dropout_p': 0.336836212053763, 'lr': 0.00036558196062431693}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00737, val_loss: 0.00514, lr: 2.50E-04, _patience: 10
Epoch: 2 | train_loss: 0.00465, val_loss: 0.00396, lr: 2.50E-04, _patience: 10
Epoch: 3 | train_loss: 0.00385, val_loss: 0.00353, lr: 2.50E-04, _patience: 10
Epoch: 4 | train_loss: 0.00323, val_loss: 0.00314, lr: 2.50E-04, _patience: 10
Epoch: 5 | train_loss: 0.00271, val_loss: 0.00282, lr: 2.50E-04, _patience: 10
Epoch: 6 | train_loss: 0.00233, val_loss: 0.00262, lr: 2.50E-04, _patience: 10
Epoch: 7 | train_loss: 0.00205, val_loss: 0.00250, lr: 2.50E-04, _patience: 10
Epoch: 8 | train_loss: 0.00175, val_loss: 0.00241, lr: 2.50E-04, _patience: 10
Epoch: 9 | train_loss: 0.00153, val_loss: 0.00233, lr: 2.50E-04, _patience: 10
Epoch: 10 | train_loss: 0.00130, val_loss: 0.00230, lr: 2.50E-04, _patience: 10
Epoch: 11 | train_loss: 0.00115, val_loss: 0.00229, lr: 2.50E-04, _patience: 10
Epoch: 12 | train_loss: 0.00101, val_loss: 0.00225, lr: 2.50E-04, _patience: 10
Epoch: 13 | train_loss: 0.00089, val_loss: 0.0022

[32m[I 2021-09-02 02:05:53,956][0m Trial 34 finished with value: 0.6645582930196803 and parameters: {'embedding_dim': 301, 'num_filters': 414, 'hidden_dim': 441, 'dropout_p': 0.454128690583519, 'lr': 0.0002498598625596354}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00889, val_loss: 0.00538, lr: 3.56E-04, _patience: 10
Epoch: 2 | train_loss: 0.00497, val_loss: 0.00395, lr: 3.56E-04, _patience: 10
Epoch: 3 | train_loss: 0.00368, val_loss: 0.00341, lr: 3.56E-04, _patience: 10
Epoch: 4 | train_loss: 0.00301, val_loss: 0.00289, lr: 3.56E-04, _patience: 10
Epoch: 5 | train_loss: 0.00246, val_loss: 0.00262, lr: 3.56E-04, _patience: 10
Epoch: 6 | train_loss: 0.00199, val_loss: 0.00246, lr: 3.56E-04, _patience: 10
Epoch: 7 | train_loss: 0.00167, val_loss: 0.00236, lr: 3.56E-04, _patience: 10
Epoch: 8 | train_loss: 0.00136, val_loss: 0.00227, lr: 3.56E-04, _patience: 10
Epoch: 9 | train_loss: 0.00114, val_loss: 0.00226, lr: 3.56E-04, _patience: 10
Epoch: 10 | train_loss: 0.00096, val_loss: 0.00227, lr: 3.56E-04, _patience: 9
Epoch: 11 | train_loss: 0.00083, val_loss: 0.00241, lr: 3.56E-04, _patience: 8
Epoch: 12 | train_loss: 0.00068, val_loss: 0.00245, lr: 3.56E-04, _patience: 7
Epoch: 13 | train_loss: 0.00061, val_loss: 0.00253, 

[32m[I 2021-09-02 02:06:59,971][0m Trial 35 finished with value: 0.6759572363932831 and parameters: {'embedding_dim': 270, 'num_filters': 489, 'hidden_dim': 385, 'dropout_p': 0.389991661901649, 'lr': 0.00035554349847071885}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00865, val_loss: 0.00472, lr: 4.36E-04, _patience: 10
Epoch: 2 | train_loss: 0.00480, val_loss: 0.00407, lr: 4.36E-04, _patience: 10
Epoch: 3 | train_loss: 0.00369, val_loss: 0.00333, lr: 4.36E-04, _patience: 10
Epoch: 4 | train_loss: 0.00292, val_loss: 0.00286, lr: 4.36E-04, _patience: 10
Epoch: 5 | train_loss: 0.00242, val_loss: 0.00258, lr: 4.36E-04, _patience: 10
Epoch: 6 | train_loss: 0.00195, val_loss: 0.00240, lr: 4.36E-04, _patience: 10
Epoch: 7 | train_loss: 0.00162, val_loss: 0.00228, lr: 4.36E-04, _patience: 10
Epoch: 8 | train_loss: 0.00134, val_loss: 0.00234, lr: 4.36E-04, _patience: 9
Epoch: 9 | train_loss: 0.00110, val_loss: 0.00240, lr: 4.36E-04, _patience: 8
Epoch: 10 | train_loss: 0.00095, val_loss: 0.00254, lr: 4.36E-04, _patience: 7
Epoch: 11 | train_loss: 0.00080, val_loss: 0.00243, lr: 4.36E-04, _patience: 6
Epoch: 12 | train_loss: 0.00066, val_loss: 0.00284, lr: 4.36E-04, _patience: 5
Epoch: 13 | train_loss: 0.00061, val_loss: 0.00268, lr

[32m[I 2021-09-02 02:07:55,902][0m Trial 36 finished with value: 0.6723790436288055 and parameters: {'embedding_dim': 347, 'num_filters': 364, 'hidden_dim': 465, 'dropout_p': 0.5154961752966003, 'lr': 0.00043597679421147813}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00673, val_loss: 0.00517, lr: 2.02E-04, _patience: 10
Epoch: 2 | train_loss: 0.00474, val_loss: 0.00403, lr: 2.02E-04, _patience: 10
Epoch: 3 | train_loss: 0.00405, val_loss: 0.00376, lr: 2.02E-04, _patience: 10
Epoch: 4 | train_loss: 0.00357, val_loss: 0.00346, lr: 2.02E-04, _patience: 10
Epoch: 5 | train_loss: 0.00318, val_loss: 0.00311, lr: 2.02E-04, _patience: 10
Epoch: 6 | train_loss: 0.00284, val_loss: 0.00281, lr: 2.02E-04, _patience: 10
Epoch: 7 | train_loss: 0.00250, val_loss: 0.00260, lr: 2.02E-04, _patience: 10
Epoch: 8 | train_loss: 0.00223, val_loss: 0.00248, lr: 2.02E-04, _patience: 10
Epoch: 9 | train_loss: 0.00200, val_loss: 0.00240, lr: 2.02E-04, _patience: 10
Epoch: 10 | train_loss: 0.00181, val_loss: 0.00231, lr: 2.02E-04, _patience: 10
Epoch: 11 | train_loss: 0.00161, val_loss: 0.00227, lr: 2.02E-04, _patience: 10
Epoch: 12 | train_loss: 0.00151, val_loss: 0.00221, lr: 2.02E-04, _patience: 10
Epoch: 13 | train_loss: 0.00134, val_loss: 0.0022

[32m[I 2021-09-02 02:09:04,218][0m Trial 37 finished with value: 0.6767903152732834 and parameters: {'embedding_dim': 216, 'num_filters': 439, 'hidden_dim': 320, 'dropout_p': 0.42297490917480796, 'lr': 0.00020227149215172236}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00736, val_loss: 0.00536, lr: 2.33E-04, _patience: 10
Epoch: 2 | train_loss: 0.00505, val_loss: 0.00395, lr: 2.33E-04, _patience: 10
Epoch: 3 | train_loss: 0.00421, val_loss: 0.00369, lr: 2.33E-04, _patience: 10
Epoch: 4 | train_loss: 0.00366, val_loss: 0.00333, lr: 2.33E-04, _patience: 10
Epoch: 5 | train_loss: 0.00314, val_loss: 0.00296, lr: 2.33E-04, _patience: 10
Epoch: 6 | train_loss: 0.00279, val_loss: 0.00275, lr: 2.33E-04, _patience: 10
Epoch: 7 | train_loss: 0.00248, val_loss: 0.00257, lr: 2.33E-04, _patience: 10
Epoch: 8 | train_loss: 0.00220, val_loss: 0.00249, lr: 2.33E-04, _patience: 10
Epoch: 9 | train_loss: 0.00204, val_loss: 0.00241, lr: 2.33E-04, _patience: 10
Epoch: 10 | train_loss: 0.00178, val_loss: 0.00234, lr: 2.33E-04, _patience: 10
Epoch: 11 | train_loss: 0.00161, val_loss: 0.00232, lr: 2.33E-04, _patience: 10
Epoch: 12 | train_loss: 0.00144, val_loss: 0.00238, lr: 2.33E-04, _patience: 9
Epoch: 13 | train_loss: 0.00132, val_loss: 0.00230

[32m[I 2021-09-02 02:10:31,564][0m Trial 38 finished with value: 0.6675077310794176 and parameters: {'embedding_dim': 301, 'num_filters': 335, 'hidden_dim': 421, 'dropout_p': 0.5727492035754438, 'lr': 0.0002334774066911049}. Best is trial 23 with value: 0.6893605235149695.[0m


Epoch: 1 | train_loss: 0.00763, val_loss: 0.00471, lr: 2.84E-04, _patience: 10
Epoch: 2 | train_loss: 0.00451, val_loss: 0.00371, lr: 2.84E-04, _patience: 10
Epoch: 3 | train_loss: 0.00335, val_loss: 0.00305, lr: 2.84E-04, _patience: 10
Epoch: 4 | train_loss: 0.00263, val_loss: 0.00268, lr: 2.84E-04, _patience: 10
Epoch: 5 | train_loss: 0.00217, val_loss: 0.00246, lr: 2.84E-04, _patience: 10
Epoch: 6 | train_loss: 0.00180, val_loss: 0.00229, lr: 2.84E-04, _patience: 10
Epoch: 7 | train_loss: 0.00145, val_loss: 0.00228, lr: 2.84E-04, _patience: 10
Epoch: 8 | train_loss: 0.00118, val_loss: 0.00221, lr: 2.84E-04, _patience: 10
Epoch: 9 | train_loss: 0.00100, val_loss: 0.00223, lr: 2.84E-04, _patience: 9
Epoch: 10 | train_loss: 0.00086, val_loss: 0.00225, lr: 2.84E-04, _patience: 8
Epoch: 11 | train_loss: 0.00074, val_loss: 0.00230, lr: 2.84E-04, _patience: 7
Epoch: 12 | train_loss: 0.00068, val_loss: 0.00235, lr: 2.84E-04, _patience: 6
Epoch: 13 | train_loss: 0.00059, val_loss: 0.00235, l

[32m[I 2021-09-02 02:11:49,138][0m Trial 39 finished with value: 0.6911921640685477 and parameters: {'embedding_dim': 407, 'num_filters': 510, 'hidden_dim': 334, 'dropout_p': 0.3455200387912171, 'lr': 0.00028381065170221184}. Best is trial 39 with value: 0.6911921640685477.[0m


Epoch: 1 | train_loss: 0.00690, val_loss: 0.00537, lr: 1.72E-04, _patience: 10
Epoch: 2 | train_loss: 0.00436, val_loss: 0.00378, lr: 1.72E-04, _patience: 10
Epoch: 3 | train_loss: 0.00355, val_loss: 0.00345, lr: 1.72E-04, _patience: 10
Epoch: 4 | train_loss: 0.00303, val_loss: 0.00299, lr: 1.72E-04, _patience: 10
Epoch: 5 | train_loss: 0.00256, val_loss: 0.00273, lr: 1.72E-04, _patience: 10
Epoch: 6 | train_loss: 0.00219, val_loss: 0.00259, lr: 1.72E-04, _patience: 10
Epoch: 7 | train_loss: 0.00191, val_loss: 0.00250, lr: 1.72E-04, _patience: 10
Epoch: 8 | train_loss: 0.00167, val_loss: 0.00242, lr: 1.72E-04, _patience: 10
Epoch: 9 | train_loss: 0.00149, val_loss: 0.00236, lr: 1.72E-04, _patience: 10
Epoch: 10 | train_loss: 0.00131, val_loss: 0.00233, lr: 1.72E-04, _patience: 10
Epoch: 11 | train_loss: 0.00116, val_loss: 0.00237, lr: 1.72E-04, _patience: 9
Epoch: 12 | train_loss: 0.00103, val_loss: 0.00232, lr: 1.72E-04, _patience: 10
Epoch: 13 | train_loss: 0.00092, val_loss: 0.00239

[32m[I 2021-09-02 02:13:32,089][0m Trial 40 finished with value: 0.6872144231049917 and parameters: {'embedding_dim': 422, 'num_filters': 496, 'hidden_dim': 323, 'dropout_p': 0.3334063796286489, 'lr': 0.00017247917135335329}. Best is trial 39 with value: 0.6911921640685477.[0m


Epoch: 1 | train_loss: 0.00674, val_loss: 0.00533, lr: 1.57E-04, _patience: 10
Epoch: 2 | train_loss: 0.00436, val_loss: 0.00376, lr: 1.57E-04, _patience: 10
Epoch: 3 | train_loss: 0.00358, val_loss: 0.00346, lr: 1.57E-04, _patience: 10
Epoch: 4 | train_loss: 0.00301, val_loss: 0.00305, lr: 1.57E-04, _patience: 10
Epoch: 5 | train_loss: 0.00256, val_loss: 0.00279, lr: 1.57E-04, _patience: 10
Epoch: 6 | train_loss: 0.00227, val_loss: 0.00264, lr: 1.57E-04, _patience: 10
Epoch: 7 | train_loss: 0.00200, val_loss: 0.00247, lr: 1.57E-04, _patience: 10
Epoch: 8 | train_loss: 0.00173, val_loss: 0.00243, lr: 1.57E-04, _patience: 10
Epoch: 9 | train_loss: 0.00152, val_loss: 0.00233, lr: 1.57E-04, _patience: 10
Epoch: 10 | train_loss: 0.00134, val_loss: 0.00229, lr: 1.57E-04, _patience: 10
Epoch: 11 | train_loss: 0.00121, val_loss: 0.00227, lr: 1.57E-04, _patience: 10
Epoch: 12 | train_loss: 0.00110, val_loss: 0.00222, lr: 1.57E-04, _patience: 10
Epoch: 13 | train_loss: 0.00096, val_loss: 0.0022

[32m[I 2021-09-02 02:15:11,013][0m Trial 41 finished with value: 0.6775864392956321 and parameters: {'embedding_dim': 440, 'num_filters': 512, 'hidden_dim': 323, 'dropout_p': 0.334420287064763, 'lr': 0.00015653552494189412}. Best is trial 39 with value: 0.6911921640685477.[0m


Epoch: 1 | train_loss: 0.00687, val_loss: 0.00557, lr: 1.73E-04, _patience: 10
Epoch: 2 | train_loss: 0.00458, val_loss: 0.00382, lr: 1.73E-04, _patience: 10
Epoch: 3 | train_loss: 0.00373, val_loss: 0.00353, lr: 1.73E-04, _patience: 10
Epoch: 4 | train_loss: 0.00320, val_loss: 0.00306, lr: 1.73E-04, _patience: 10
Epoch: 5 | train_loss: 0.00274, val_loss: 0.00279, lr: 1.73E-04, _patience: 10
Epoch: 6 | train_loss: 0.00241, val_loss: 0.00259, lr: 1.73E-04, _patience: 10
Epoch: 7 | train_loss: 0.00205, val_loss: 0.00249, lr: 1.73E-04, _patience: 10
Epoch: 8 | train_loss: 0.00184, val_loss: 0.00238, lr: 1.73E-04, _patience: 10
Epoch: 9 | train_loss: 0.00164, val_loss: 0.00232, lr: 1.73E-04, _patience: 10
Epoch: 10 | train_loss: 0.00142, val_loss: 0.00230, lr: 1.73E-04, _patience: 10
Epoch: 11 | train_loss: 0.00131, val_loss: 0.00233, lr: 1.73E-04, _patience: 9
Epoch: 12 | train_loss: 0.00114, val_loss: 0.00235, lr: 1.73E-04, _patience: 8
Epoch: 13 | train_loss: 0.00102, val_loss: 0.00234,

[32m[I 2021-09-02 02:17:29,279][0m Trial 42 finished with value: 0.6800139643269503 and parameters: {'embedding_dim': 411, 'num_filters': 490, 'hidden_dim': 278, 'dropout_p': 0.3578822324444334, 'lr': 0.00017252414522322019}. Best is trial 39 with value: 0.6911921640685477.[0m


Epoch: 1 | train_loss: 0.00648, val_loss: 0.00546, lr: 1.39E-04, _patience: 10
Epoch: 2 | train_loss: 0.00435, val_loss: 0.00382, lr: 1.39E-04, _patience: 10
Epoch: 3 | train_loss: 0.00361, val_loss: 0.00354, lr: 1.39E-04, _patience: 10
Epoch: 4 | train_loss: 0.00314, val_loss: 0.00310, lr: 1.39E-04, _patience: 10
Epoch: 5 | train_loss: 0.00272, val_loss: 0.00285, lr: 1.39E-04, _patience: 10
Epoch: 6 | train_loss: 0.00236, val_loss: 0.00263, lr: 1.39E-04, _patience: 10
Epoch: 7 | train_loss: 0.00203, val_loss: 0.00252, lr: 1.39E-04, _patience: 10
Epoch: 8 | train_loss: 0.00186, val_loss: 0.00242, lr: 1.39E-04, _patience: 10
Epoch: 9 | train_loss: 0.00163, val_loss: 0.00239, lr: 1.39E-04, _patience: 10
Epoch: 10 | train_loss: 0.00147, val_loss: 0.00230, lr: 1.39E-04, _patience: 10
Epoch: 11 | train_loss: 0.00131, val_loss: 0.00235, lr: 1.39E-04, _patience: 9
Epoch: 12 | train_loss: 0.00118, val_loss: 0.00235, lr: 1.39E-04, _patience: 8
Epoch: 13 | train_loss: 0.00108, val_loss: 0.00239,

[32m[I 2021-09-02 02:19:43,708][0m Trial 43 finished with value: 0.6681759892980845 and parameters: {'embedding_dim': 465, 'num_filters': 455, 'hidden_dim': 299, 'dropout_p': 0.324743395619151, 'lr': 0.00013947514801406982}. Best is trial 39 with value: 0.6911921640685477.[0m


Epoch: 1 | train_loss: 0.00970, val_loss: 0.00576, lr: 2.91E-04, _patience: 10
Epoch: 2 | train_loss: 0.00559, val_loss: 0.00394, lr: 2.91E-04, _patience: 10
Epoch: 3 | train_loss: 0.00434, val_loss: 0.00347, lr: 2.91E-04, _patience: 10
Epoch: 4 | train_loss: 0.00358, val_loss: 0.00312, lr: 2.91E-04, _patience: 10
Epoch: 5 | train_loss: 0.00318, val_loss: 0.00278, lr: 2.91E-04, _patience: 10
Epoch: 6 | train_loss: 0.00280, val_loss: 0.00259, lr: 2.91E-04, _patience: 10
Epoch: 7 | train_loss: 0.00246, val_loss: 0.00247, lr: 2.91E-04, _patience: 10
Epoch: 8 | train_loss: 0.00221, val_loss: 0.00238, lr: 2.91E-04, _patience: 10
Epoch: 9 | train_loss: 0.00195, val_loss: 0.00228, lr: 2.91E-04, _patience: 10
Epoch: 10 | train_loss: 0.00178, val_loss: 0.00222, lr: 2.91E-04, _patience: 10
Epoch: 11 | train_loss: 0.00156, val_loss: 0.00231, lr: 2.91E-04, _patience: 9
Epoch: 12 | train_loss: 0.00132, val_loss: 0.00217, lr: 2.91E-04, _patience: 10
Epoch: 13 | train_loss: 0.00118, val_loss: 0.00213

[32m[I 2021-09-02 02:21:24,331][0m Trial 44 finished with value: 0.6817510048213098 and parameters: {'embedding_dim': 396, 'num_filters': 489, 'hidden_dim': 339, 'dropout_p': 0.6707874722911259, 'lr': 0.00029145359287980455}. Best is trial 39 with value: 0.6911921640685477.[0m


Epoch: 1 | train_loss: 0.00653, val_loss: 0.00575, lr: 1.00E-04, _patience: 10
Epoch: 2 | train_loss: 0.00479, val_loss: 0.00389, lr: 1.00E-04, _patience: 10
Epoch: 3 | train_loss: 0.00403, val_loss: 0.00367, lr: 1.00E-04, _patience: 10
Epoch: 4 | train_loss: 0.00362, val_loss: 0.00343, lr: 1.00E-04, _patience: 10
Epoch: 5 | train_loss: 0.00329, val_loss: 0.00319, lr: 1.00E-04, _patience: 10
Epoch: 6 | train_loss: 0.00294, val_loss: 0.00298, lr: 1.00E-04, _patience: 10
Epoch: 7 | train_loss: 0.00273, val_loss: 0.00280, lr: 1.00E-04, _patience: 10
Epoch: 8 | train_loss: 0.00245, val_loss: 0.00265, lr: 1.00E-04, _patience: 10
Epoch: 9 | train_loss: 0.00231, val_loss: 0.00255, lr: 1.00E-04, _patience: 10
Epoch: 10 | train_loss: 0.00208, val_loss: 0.00244, lr: 1.00E-04, _patience: 10
Epoch: 11 | train_loss: 0.00191, val_loss: 0.00240, lr: 1.00E-04, _patience: 10
Epoch: 12 | train_loss: 0.00177, val_loss: 0.00233, lr: 1.00E-04, _patience: 10
Epoch: 13 | train_loss: 0.00166, val_loss: 0.0022

[32m[I 2021-09-02 02:23:30,085][0m Trial 45 finished with value: 0.6797041869613061 and parameters: {'embedding_dim': 442, 'num_filters': 508, 'hidden_dim': 258, 'dropout_p': 0.41207090925279083, 'lr': 0.00010034570989433325}. Best is trial 39 with value: 0.6911921640685477.[0m


Epoch: 1 | train_loss: 0.00743, val_loss: 0.00499, lr: 2.57E-04, _patience: 10
Epoch: 2 | train_loss: 0.00452, val_loss: 0.00367, lr: 2.57E-04, _patience: 10
Epoch: 3 | train_loss: 0.00353, val_loss: 0.00315, lr: 2.57E-04, _patience: 10
Epoch: 4 | train_loss: 0.00288, val_loss: 0.00278, lr: 2.57E-04, _patience: 10
Epoch: 5 | train_loss: 0.00240, val_loss: 0.00255, lr: 2.57E-04, _patience: 10
Epoch: 6 | train_loss: 0.00197, val_loss: 0.00235, lr: 2.57E-04, _patience: 10
Epoch: 7 | train_loss: 0.00168, val_loss: 0.00237, lr: 2.57E-04, _patience: 9
Epoch: 8 | train_loss: 0.00145, val_loss: 0.00224, lr: 2.57E-04, _patience: 10
Epoch: 9 | train_loss: 0.00126, val_loss: 0.00225, lr: 2.57E-04, _patience: 9
Epoch: 10 | train_loss: 0.00107, val_loss: 0.00227, lr: 2.57E-04, _patience: 8
Epoch: 11 | train_loss: 0.00094, val_loss: 0.00246, lr: 2.57E-04, _patience: 7
Epoch: 12 | train_loss: 0.00086, val_loss: 0.00232, lr: 2.57E-04, _patience: 6
Epoch: 13 | train_loss: 0.00072, val_loss: 0.00245, lr

[32m[I 2021-09-02 02:24:47,025][0m Trial 46 finished with value: 0.6682188148851675 and parameters: {'embedding_dim': 469, 'num_filters': 441, 'hidden_dim': 278, 'dropout_p': 0.3878453021896208, 'lr': 0.00025700347863230643}. Best is trial 39 with value: 0.6911921640685477.[0m


Epoch: 1 | train_loss: 0.00733, val_loss: 0.00564, lr: 1.92E-04, _patience: 10
Epoch: 2 | train_loss: 0.00506, val_loss: 0.00388, lr: 1.92E-04, _patience: 10
Epoch: 3 | train_loss: 0.00402, val_loss: 0.00361, lr: 1.92E-04, _patience: 10
Epoch: 4 | train_loss: 0.00356, val_loss: 0.00315, lr: 1.92E-04, _patience: 10
Epoch: 5 | train_loss: 0.00302, val_loss: 0.00288, lr: 1.92E-04, _patience: 10
Epoch: 6 | train_loss: 0.00267, val_loss: 0.00265, lr: 1.92E-04, _patience: 10
Epoch: 7 | train_loss: 0.00236, val_loss: 0.00256, lr: 1.92E-04, _patience: 10
Epoch: 8 | train_loss: 0.00212, val_loss: 0.00245, lr: 1.92E-04, _patience: 10
Epoch: 9 | train_loss: 0.00185, val_loss: 0.00236, lr: 1.92E-04, _patience: 10
Epoch: 10 | train_loss: 0.00171, val_loss: 0.00231, lr: 1.92E-04, _patience: 10
Epoch: 11 | train_loss: 0.00153, val_loss: 0.00223, lr: 1.92E-04, _patience: 10
Epoch: 12 | train_loss: 0.00137, val_loss: 0.00229, lr: 1.92E-04, _patience: 9
Epoch: 13 | train_loss: 0.00125, val_loss: 0.00227

[32m[I 2021-09-02 02:26:12,666][0m Trial 47 finished with value: 0.657470871060554 and parameters: {'embedding_dim': 375, 'num_filters': 463, 'hidden_dim': 219, 'dropout_p': 0.4436243605638399, 'lr': 0.00019199048654475534}. Best is trial 39 with value: 0.6911921640685477.[0m


Epoch: 1 | train_loss: 0.00752, val_loss: 0.00551, lr: 2.32E-04, _patience: 10
Epoch: 2 | train_loss: 0.00459, val_loss: 0.00366, lr: 2.32E-04, _patience: 10
Epoch: 3 | train_loss: 0.00375, val_loss: 0.00338, lr: 2.32E-04, _patience: 10
Epoch: 4 | train_loss: 0.00309, val_loss: 0.00297, lr: 2.32E-04, _patience: 10
Epoch: 5 | train_loss: 0.00263, val_loss: 0.00274, lr: 2.32E-04, _patience: 10
Epoch: 6 | train_loss: 0.00224, val_loss: 0.00257, lr: 2.32E-04, _patience: 10
Epoch: 7 | train_loss: 0.00188, val_loss: 0.00251, lr: 2.32E-04, _patience: 10
Epoch: 8 | train_loss: 0.00171, val_loss: 0.00239, lr: 2.32E-04, _patience: 10
Epoch: 9 | train_loss: 0.00147, val_loss: 0.00239, lr: 2.32E-04, _patience: 10
Epoch: 10 | train_loss: 0.00132, val_loss: 0.00225, lr: 2.32E-04, _patience: 10
Epoch: 11 | train_loss: 0.00114, val_loss: 0.00229, lr: 2.32E-04, _patience: 9
Epoch: 12 | train_loss: 0.00102, val_loss: 0.00230, lr: 2.32E-04, _patience: 8
Epoch: 13 | train_loss: 0.00089, val_loss: 0.00229,

[32m[I 2021-09-02 02:27:38,946][0m Trial 48 finished with value: 0.670257710368399 and parameters: {'embedding_dim': 413, 'num_filters': 512, 'hidden_dim': 182, 'dropout_p': 0.3509363167214768, 'lr': 0.00023231176297708252}. Best is trial 39 with value: 0.6911921640685477.[0m


Epoch: 1 | train_loss: 0.00624, val_loss: 0.00504, lr: 1.72E-04, _patience: 10
Epoch: 2 | train_loss: 0.00419, val_loss: 0.00392, lr: 1.72E-04, _patience: 10
Epoch: 3 | train_loss: 0.00365, val_loss: 0.00362, lr: 1.72E-04, _patience: 10
Epoch: 4 | train_loss: 0.00322, val_loss: 0.00320, lr: 1.72E-04, _patience: 10
Epoch: 5 | train_loss: 0.00281, val_loss: 0.00291, lr: 1.72E-04, _patience: 10
Epoch: 6 | train_loss: 0.00248, val_loss: 0.00268, lr: 1.72E-04, _patience: 10
Epoch: 7 | train_loss: 0.00219, val_loss: 0.00250, lr: 1.72E-04, _patience: 10
Epoch: 8 | train_loss: 0.00192, val_loss: 0.00242, lr: 1.72E-04, _patience: 10
Epoch: 9 | train_loss: 0.00174, val_loss: 0.00235, lr: 1.72E-04, _patience: 10
Epoch: 10 | train_loss: 0.00157, val_loss: 0.00231, lr: 1.72E-04, _patience: 10
Epoch: 11 | train_loss: 0.00141, val_loss: 0.00224, lr: 1.72E-04, _patience: 10
Epoch: 12 | train_loss: 0.00124, val_loss: 0.00224, lr: 1.72E-04, _patience: 10
Epoch: 13 | train_loss: 0.00114, val_loss: 0.0022

[32m[I 2021-09-02 02:28:53,014][0m Trial 49 finished with value: 0.6630354636692574 and parameters: {'embedding_dim': 433, 'num_filters': 212, 'hidden_dim': 379, 'dropout_p': 0.301335143495687, 'lr': 0.00017175757149907504}. Best is trial 39 with value: 0.6911921640685477.[0m


In [None]:
from pyngrok import ngrok

In [None]:
# MLFlow dashboard
import os  # cell-local import: only needed to read the ngrok token below

# Start the MLflow server as a background shell job (trailing "&") on port 5000,
# using $PWD/experiments/ as its backend store.
get_ipython().system_raw("mlflow server -h 0.0.0.0 -p 5000 --backend-store-uri $PWD/experiments/ &")

# Tunnel port 5000 through ngrok and print the public URL of the tracking UI.
ngrok.kill()  # stop any ngrok process left over from a previous run of this cell
# Take the token from the environment rather than hardcoding it in the notebook;
# falls back to the original empty string when NGROK_AUTH_TOKEN is not set.
ngrok.set_auth_token(os.environ.get("NGROK_AUTH_TOKEN", ""))
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel.public_url)

MLflow Tracking UI: https://9d79-35-239-89-63.ngrok.io


In [None]:
# All trials, ranked from highest to lowest objective value
trials_df = study.trials_dataframe().sort_values(by="value", ascending=False)
trials_df.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_dropout_p,params_embedding_dim,params_hidden_dim,params_lr,params_num_filters,user_attrs_f1,user_attrs_precision,user_attrs_recall,user_attrs_threshold,state
39,39,0.691192,2021-09-02 02:10:31.581366,2021-09-02 02:11:49.137872,0 days 00:01:17.556506,0.34552,407,334,0.000284,510,0.691192,0.83525,0.607539,0.281473,COMPLETE
23,23,0.689361,2021-09-02 01:53:21.974442,2021-09-02 01:54:24.107666,0 days 00:01:02.133224,0.392532,231,471,0.000318,476,0.689361,0.846572,0.600887,0.337744,COMPLETE
1,1,0.68785,2021-09-02 01:19:13.448726,2021-09-02 01:20:30.698038,0 days 00:01:17.249312,0.466795,182,496,0.000194,486,0.68785,0.851291,0.594235,0.3263,COMPLETE
40,40,0.687214,2021-09-02 02:11:49.155965,2021-09-02 02:13:32.089228,0 days 00:01:42.933263,0.333406,422,323,0.000172,496,0.687214,0.82072,0.614191,0.27545,COMPLETE
26,26,0.687146,2021-09-02 01:56:15.237041,2021-09-02 01:57:10.125590,0 days 00:00:54.888549,0.364982,240,485,0.000425,504,0.687146,0.838968,0.600887,0.317778,COMPLETE


In [None]:
# Best trial
# The study's objective value is the f1 score (the trials table shows `value`
# identical to `user_attrs_f1`), not the validation loss, so label it as such.
print (f"Best value (f1): {study.best_trial.value}")
print (f"Best hyperparameters: {study.best_trial.params}")

Best value (val loss): 0.6911921640685477
Best hyperparameters: {'embedding_dim': 407, 'num_filters': 510, 'hidden_dim': 334, 'dropout_p': 0.3455200387912171, 'lr': 0.00028381065170221184}


In [None]:
# Save best parameters
# Start from a copy of the base arguments, then let the tuned hyperparameters
# from the best trial override any keys they share.
params = dict(args.__dict__)
params.update(study.best_trial.params)
# Carry over the threshold recorded on the best trial's user attributes.
params["threshold"] = study.best_trial.user_attrs["threshold"]
print (json.dumps(params, cls=NumpyEncoder, indent=2))

{
  "char_level": true,
  "filter_sizes": [
    1,
    2,
    3,
    4,
    5,
    6,
    7,
    8,
    9,
    10
  ],
  "batch_size": 64,
  "embedding_dim": 407,
  "num_filters": 510,
  "hidden_dim": 334,
  "dropout_p": 0.3455200387912171,
  "lr": 0.00028381065170221184,
  "num_epochs": 100,
  "patience": 10,
  "threshold": 0.2814725935459137
}
