# Importing Libraries

In [2]:
pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
pip install better_profanity

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
%pip install torch transformers datasets ipywidgets flaml[blendsearch,ray] #It is important to run this before installing datasets, due to compatibility issues

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
#Importing all necessary packages and libraries

import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import emoji
import string
import random
import time
from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
import torch
from tqdm import tqdm
import spacy
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from better_profanity import profanity

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
nlp = spacy.blank("en")
%matplotlib inline

### Define Path

In [8]:
MODEL_PATH = "/content/drive/My Drive/ML4NLP_Assignment5/model/" #That's where we will store the model 
DATA_PATH = "/content/drive/My Drive/ML4NLP_Assignment5/data/" #That's where we will store the data

### Loading Dataset and Mounting Drive

In [9]:
# MOUNTING GOOGLE DRIVE

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [10]:
# FUNCTION FOR LOADING DATA

def load_data(data_path, file_name, label=False):
    data_path = data_path + file_name
    df = pd.read_csv(data_path, on_bad_lines="skip", sep = "\t")
    X = df["Tweet"]
    y = df["Intensity Score"]
    return X, y

In [12]:
# LOADING DATA

X_train_raw, y_train_raw = load_data(DATA_PATH, "EI-reg-En-anger-train.txt")   #raw train dataset
X_test_raw, y_test_raw = load_data(DATA_PATH, "2018-EI-reg-En-anger-test-gold.txt") #raw test dataset

### Data Cleaning & Tokenising

In [14]:
#Class for data cleaning and pre-processing 

class DataCleaning:
    def __init__(self):
        return
    # LOWERCASING
    def convert_lowercase(self, text):
        text = text.str.lower()
        return text
    # REMOVING HTML TAGS AND URLS
    def remove_html_url(self, text):
        re_html = re.compile("<.*?>")
        re_url = re.compile("https?://\S+|www\.\S+")
        re_user = re.compile("@\w+")
        wo_html_str = re_html.sub(r"", text)
        wo_url_str = re_url.sub("URL", wo_html_str)
        wo_user_str = re_user.sub("USER", wo_url_str)
        return wo_user_str
    # REPLACING SWEAR WORDS
    def replace_swear_words(self, text):
        text = profanity.censor(text, "-")
        return text
    # REMOVING PUNCTUATIONS & NUMBERS
    def remove_punctuation_number(self, text):
        text = text.str.replace('[{}]'.format(string.punctuation), "", regex=True)
        text = text.str.replace("\d+", "", regex=True)
        return text 
    # LEMMATISATION
    def lemmatisation(self, text):
        lemmatiser = WordNetLemmatizer()
        return lemmatiser.lemmatize(text)
    # TOKENISING
    def tokenise(self, text):
        tokenizer = TweetTokenizer()
        text = tokenizer.tokenize(text)
        return text
    # REPLACING EMOJIS TO TEXTS
    def demojise(self, text):
        text = [emoji.demojize(word)[1:-1] if emoji.demojize(word) != word else word for word in text]
        return text
    # REPLACING SWEAR WORDS TO ANGRY
    def replace_swear_word_to_angry(self, text):
        text = text.replace("----", "angry")
        return text
    # APPLYING ALL METHODS
    def clean(self, X):
        res = self.convert_lowercase(X)
        res = res.apply(self.remove_html_url)
        res = self.remove_punctuation_number(res)
        #res = res.apply(self.replace_swear_words) - we removed this step in the end due to better results without it 
        #res = res.apply(self.replace_swear_word_to_angry) - we removed this step in the end due to better results without it 
        res = res.apply(self.lemmatisation)
        res = res.apply(self.tokenise)
        res = res.apply(self.demojise)
        return res

In [15]:
#Applying the class defined above to clean and pre-process the data 
data_cleaning = DataCleaning()
X_train = data_cleaning.clean(X_train_raw) #applying to train dataset
X_test = data_cleaning.clean(X_test_raw) #applying to test dataset

# Save Pre-Processed Data

In [16]:
#Checking what type is our data 
type(X_train)

pandas.core.series.Series

In [17]:
#We convert it to pandas dataframe
X_train = X_train.to_frame()
X_test = X_test.to_frame()

In [18]:
#Just making sure what is the data format 
X_train.head()

Unnamed: 0,Tweet
0,"[USER, USER, shut, up, hashtags, are, cool, of..."
1,"[it, makes, me, so, fucking, irate, jesus, nob..."
2,"[lol, adam, the, bull, with, his, fake, outrage]"
3,"[USER, passed, away, early, this, morning, in,..."
4,"[USER, lol, wow, i, was, gonna, say, really, h..."


In [19]:
#We don't need to pre-process the labels 
y_train = y_train_raw
y_test = y_test_raw

In [20]:
#We convert the labels to dataframe
y_train = y_train.to_frame()
y_test = y_test.to_frame()

In [21]:
#Saving the pre-processed data
X_train.to_csv("/content/drive/My Drive/ML4NLP_Assignment5/X_train_2.csv")
X_test.to_csv("/content/drive/My Drive/ML4NLP_Assignment5/X_test_2.csv")
y_train.to_csv("/content/drive/My Drive/ML4NLP_Assignment5/y_train_2.csv")
y_test.to_csv("/content/drive/My Drive/ML4NLP_Assignment5/y_test_2.csv")

# Load the pre-processed data

In [22]:
#Loading the pre-processed data
X_train = pd.read_csv("/content/drive/My Drive/ML4NLP_Assignment5/X_train_2.csv")
X_test = pd.read_csv("/content/drive/My Drive/ML4NLP_Assignment5/X_test_2.csv")
y_train = pd.read_csv("/content/drive/My Drive/ML4NLP_Assignment5/y_train_2.csv")
y_test = pd.read_csv("/content/drive/My Drive/ML4NLP_Assignment5/y_test_2.csv")

In [23]:
X_train.drop(columns=["Unnamed: 0"])

Unnamed: 0,Tweet
0,"['USER', 'USER', 'shut', 'up', 'hashtags', 'ar..."
1,"['it', 'makes', 'me', 'so', 'fucking', 'irate'..."
2,"['lol', 'adam', 'the', 'bull', 'with', 'his', ..."
3,"['USER', 'passed', 'away', 'early', 'this', 'm..."
4,"['USER', 'lol', 'wow', 'i', 'was', 'gonna', 's..."
...,...
1696,"['got', 'a', 'tip', 'from', 'a', 'drunk', 'ube..."
1697,"['USER', 'USER', 'USER', 'USER', 'fucker', 'bl..."
1698,"['USER', 'i', 'look', 'rabid']"
1699,"['USER', 'im', 'not', 'surprised', 'i', 'would..."


In [24]:
#We are dropping the extra column which is empty
X_train = X_train.drop(columns=["Unnamed: 0"])
X_test = X_test.drop(columns=["Unnamed: 0"])
y_train = y_train.drop(columns=["Unnamed: 0"])
y_test = y_test.drop(columns=["Unnamed: 0"])

# Data formating 

Concatenate frames - We want to keep the tweets and labels in the same dataset object later 

In [25]:
frames = [X_train, y_train]

In [26]:
train_data = pd.concat(frames, axis=1) #Concatenating tweets and labels for the train dataset in one dataframe

In [27]:
frames_test = [X_test, y_test]

In [28]:
test_data = pd.concat(frames_test, axis=1) #Concatenating tweets and labels for the test dataset in one dataframe

In [29]:
train_data.head()

Unnamed: 0,Tweet,Intensity Score
0,"['USER', 'USER', 'shut', 'up', 'hashtags', 'ar...",0.562
1,"['it', 'makes', 'me', 'so', 'fucking', 'irate'...",0.75
2,"['lol', 'adam', 'the', 'bull', 'with', 'his', ...",0.417
3,"['USER', 'passed', 'away', 'early', 'this', 'm...",0.354
4,"['USER', 'lol', 'wow', 'i', 'was', 'gonna', 's...",0.438


In [30]:
test_data.head()

Unnamed: 0,Tweet,Intensity Score
0,"['USER', 'i', 'know', 'you', 'mean', 'well', '...",0.734
1,"['let', 'go', 'of', 'resentment', 'it', 'will'...",0.422
2,"['no', 'im', 'not', 'depressed', 'because', 'o...",0.663
3,"['amarnathterrorattack', 'muslims', 'are', 'ki...",0.703
4,"['prepare', 'to', 'suffer', 'the', 'sting', 'o...",0.719


In [31]:
#Renaming columns in our train dataframe
train_data.rename(columns = {'Tweet':'text'}, inplace = True)

In [32]:
train_data.rename(columns = {'Intensity Score':'label'}, inplace = True)

In [33]:
#Checking the end result
train_data.head()

Unnamed: 0,text,label
0,"['USER', 'USER', 'shut', 'up', 'hashtags', 'ar...",0.562
1,"['it', 'makes', 'me', 'so', 'fucking', 'irate'...",0.75
2,"['lol', 'adam', 'the', 'bull', 'with', 'his', ...",0.417
3,"['USER', 'passed', 'away', 'early', 'this', 'm...",0.354
4,"['USER', 'lol', 'wow', 'i', 'was', 'gonna', 's...",0.438


In [34]:
#Renaming columns in our test dataframe
test_data.rename(columns = {'Tweet':'text'}, inplace = True)
test_data.rename(columns = {'Intensity Score':'label'}, inplace = True)

In [35]:
#Checking the end result
test_data.head()

Unnamed: 0,text,label
0,"['USER', 'i', 'know', 'you', 'mean', 'well', '...",0.734
1,"['let', 'go', 'of', 'resentment', 'it', 'will'...",0.422
2,"['no', 'im', 'not', 'depressed', 'because', 'o...",0.663
3,"['amarnathterrorattack', 'muslims', 'are', 'ki...",0.703
4,"['prepare', 'to', 'suffer', 'the', 'sting', 'o...",0.719


Splitting train_data into train_data and val_data for hyperparametres tuning

In [36]:
#first shuffle the data randomly 
shuffled_train = train_data.sample(frac=1)

In [37]:
len(shuffled_train)

1701

In [38]:
# Splitting the train dataset into validation dataset and new train datatset
# approximately 15% of the initial train dataset will go to the validation dataset
train_data = shuffled_train[:1446]
val_data = shuffled_train[1446:]

In [39]:
#Importing more libraries and packages
import transformers
from datasets import Dataset,load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [40]:
#Getting the overview of the current training data format 
train_data.head()

Unnamed: 0,text,label
1677,"['a', 'USER', 'not', 'turning', 'up', 'why', '...",0.688
635,"['firsttweetever', 'sippin', 'hotchocolate', '...",0.542
1049,"['im', 'just', 'doing', 'what', 'u', 'should',...",0.312
459,"['USER', 'USER', 'what', 'cant', 'you', 'grasp...",0.75
422,"['USER', 'hi', 'monica', 'i', 'write', 'regula...",0.375


In [41]:
train_data.loc[738]

text     ['ellie', 'just', 'gave', 'me', 'loads', 'of',...
label                                                0.146
Name: 738, dtype: object

Convert train, val, and test datasets from Dataframe to Dataset Object

In [42]:
# Coverting dataframes to dataset objects 
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
test_dataset = Dataset.from_pandas(test_data, preserve_index=False)
val_dataset = Dataset.from_pandas(val_data, preserve_index=False)

In [43]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1446
})

In [44]:
test_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1002
})

In [45]:
val_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 255
})

In [46]:
val_dataset[2]

{'text': "['good', 'luck', 'to', 'all', 'furyhaney', 'players', 'playing', 'this', 'weekend', 'at', 'the', 'future', 'stars', 'showcase', 'in', 'frisco', 'tx', 'kgb', 'gtown', 'fury']",
 'label': 0.396}

In [47]:
type(val_dataset[2]['text'])

str

Above we can see that what should be a list, is represented as a string in our Datasets object. Hence, we fix it below:

In [48]:
import ast

def fix_dataset(example):
  example['text'] = ast.literal_eval(example['text'])
  return example

In [49]:
val_dataset_fixed = val_dataset.map(fix_dataset)

  0%|          | 0/255 [00:00<?, ?ex/s]

In [50]:
val_dataset_fixed[2]

{'text': ['good',
  'luck',
  'to',
  'all',
  'furyhaney',
  'players',
  'playing',
  'this',
  'weekend',
  'at',
  'the',
  'future',
  'stars',
  'showcase',
  'in',
  'frisco',
  'tx',
  'kgb',
  'gtown',
  'fury'],
 'label': 0.396}

In [51]:
train_dataset_fixed = train_dataset.map(fix_dataset)

  0%|          | 0/1446 [00:00<?, ?ex/s]

In [52]:
test_dataset_fixed = test_dataset.map(fix_dataset)

  0%|          | 0/1002 [00:00<?, ?ex/s]

In [53]:
train_dataset_fixed[2]['text']

['im',
 'just',
 'doing',
 'what',
 'u',
 'should',
 'b',
 'doing',
 'just',
 'minding',
 'my',
 'business',
 'and',
 'grinding',
 'relentless',
 'USER']

De-tokenize.
We already tokenized the dataset in the pre-processing step, but then we decided to use the HuggingFace Tokenizer instead. So we join the words again

In [54]:
#Function to join the words into one string per twit
def join_tweets(example):
  example['text'] = " ".join(example['text'])
  return example

In [55]:
#Applying this to the validation dataset
val_dataset_join = val_dataset_fixed.map(join_tweets)

  0%|          | 0/255 [00:00<?, ?ex/s]

In [56]:
val_dataset_join[2]

{'text': 'good luck to all furyhaney players playing this weekend at the future stars showcase in frisco tx kgb gtown fury',
 'label': 0.396}

In [57]:
#Applying this to the train and test datasets
train_dataset_join = train_dataset_fixed.map(join_tweets)
test_dataset_join = test_dataset_fixed.map(join_tweets)

  0%|          | 0/1446 [00:00<?, ?ex/s]

  0%|          | 0/1002 [00:00<?, ?ex/s]

In [58]:
train_dataset_join[2]

{'text': 'im just doing what u should b doing just minding my business and grinding relentless USER',
 'label': 0.312}

In [59]:
lenghts_train = []

for a in range(len(train_dataset_join)):
  txt = train_dataset_join[a]['text']
  lenghts_train.append(len(txt))

In [60]:
#Check what is the length of the longest string/tweet
max(lenghts_train)

268

Tokenize the pre-trained model

In [61]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [62]:
len(tokenizer)

30522

In [63]:
#we want to padd all tweets to the length of the longest one
max_length = 268

In [64]:
#Now we tokenize the tweets again
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset_join.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset_join.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset_join.map(tokenize_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [65]:
#Download the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifi

In [66]:
#Re-size the pre-trained model to the len of the tokenizer
model.resize_token_embeddings(len(tokenizer))

Embedding(30522, 768, padding_idx=0)

In [67]:
from sklearn.metrics import mean_squared_error

In [68]:
import scipy.stats

In [69]:
#Define the function to compute the metrics 
from datasets import load_metric

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    r, p = scipy.stats.pearsonr(labels, predictions)
    return {"rmse": rmse, "pearsonr": r}

# First model

In [70]:
#Define the training args 
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(output_dir="test_trainer",
                                  logging_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  num_train_epochs=3,
                                  save_total_limit = 2,
                                  save_strategy = 'no',
                                  load_best_model_at_end=False
                                  )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics
)

In [None]:
#Train the first model
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1701
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 321
  Number of trainable parameters = 66954241


Epoch,Training Loss,Validation Loss,Rmse
1,0.0142,0.023599,0.153618
2,0.0113,0.018607,0.136407
3,0.0063,0.0188,0.137114


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1002
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1002
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 

TrainOutput(global_step=321, training_loss=0.010591309389964072, metrics={'train_runtime': 77.4272, 'train_samples_per_second': 65.907, 'train_steps_per_second': 4.146, 'total_flos': 675969080177664.0, 'train_loss': 0.010591309389964072, 'epoch': 3.0})

# Second model

In [70]:
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error, mean_absolute_error

In [71]:
import numpy as np

In [72]:
#Different version of the function to compute the metrics 
def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    smape = 1/len(labels) * np.sum(2 * np.abs(logits-labels) / (np.abs(labels) + np.abs(logits))*100)
    

    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}

In [None]:
#Training arguments for the second model
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(output_dir="test_trainer",
                                  logging_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  num_train_epochs=3,
                                  save_total_limit = 2,
                                  save_strategy = 'no',
                                  load_best_model_at_end=False
                                  )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics_for_regression
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
#Training the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1701
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 321
  Number of trainable parameters = 66954241


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.0077,0.021191,0.021191,0.145572,0.114679,0.409509,24.137257
2,0.0061,0.01965,0.01965,0.140178,0.112058,0.452464,23.653032
3,0.0039,0.0185,0.0185,0.136013,0.107576,0.484511,22.893147


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1002
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1002
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 

TrainOutput(global_step=321, training_loss=0.005878259274075707, metrics={'train_runtime': 77.9786, 'train_samples_per_second': 65.441, 'train_steps_per_second': 4.117, 'total_flos': 675969080177664.0, 'train_loss': 0.005878259274075707, 'epoch': 3.0})

In [None]:
#Evaluate the model
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1002
  Batch size = 16


{'eval_loss': 0.018499672412872314,
 'eval_mse': 0.018499670550227165,
 'eval_rmse': 0.13601349294185638,
 'eval_mae': 0.10757619887590408,
 'eval_r2': 0.484510977098937,
 'eval_smape': 22.893147299151696,
 'eval_runtime': 4.3911,
 'eval_samples_per_second': 228.191,
 'eval_steps_per_second': 14.347,
 'epoch': 3.0}

# Third MODEL

In [None]:
#Arguments for the third model
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(output_dir="test_trainer",
                                  logging_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=64,
                                  per_device_eval_batch_size=20,
                                  weight_decay = 0.01, 
                                  learning_rate = 2e-5,
                                  num_train_epochs=10,
                                  save_total_limit = 10,
                                  save_strategy = 'epoch',
                                  load_best_model_at_end=True
                                  )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics_for_regression
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
#Training the third model
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1701
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 270
  Number of trainable parameters = 66954241


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,0.0031,0.019845,0.019845,0.140873,0.112157,0.44702,23.871443
2,0.0034,0.018579,0.018579,0.136306,0.107555,0.482293,22.882148
3,0.003,0.01853,0.01853,0.136125,0.107245,0.483669,22.812531
4,0.0026,0.018952,0.018952,0.137665,0.109006,0.471917,23.212203
5,0.0025,0.018704,0.018704,0.136764,0.108045,0.478806,23.025921
6,0.0024,0.019109,0.019109,0.138237,0.109671,0.467523,23.328195
7,0.0024,0.018377,0.018377,0.13556,0.107272,0.487943,22.883645
8,0.0022,0.018475,0.018475,0.135922,0.107592,0.485203,22.961912
9,0.002,0.018683,0.018683,0.136684,0.108142,0.479413,23.060216
10,0.0021,0.018602,0.018602,0.136388,0.107823,0.481668,23.00045


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1002
  Batch size = 20
Saving model checkpoint to test_trainer/checkpoint-27
Configuration saved in test_trainer/checkpoint-27/config.json
Model weights saved in test_trainer/checkpoint-27/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1002
  Batch size = 20
Saving model checkpoint to test_trainer/checkpoint-54
Configuration saved in test_trainer/checkpoint-54/config.json
Model weights save

TrainOutput(global_step=270, training_loss=0.0025765578641935632, metrics={'train_runtime': 254.7842, 'train_samples_per_second': 66.762, 'train_steps_per_second': 1.06, 'total_flos': 2253230267258880.0, 'train_loss': 0.0025765578641935632, 'epoch': 10.0})

In [None]:
#Evaluate the third model
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1002
  Batch size = 20


{'eval_loss': 0.018376503139734268,
 'eval_mse': 0.018376503139734268,
 'eval_rmse': 0.13555996119976044,
 'eval_mae': 0.107271708548069,
 'eval_r2': 0.4879430731052734,
 'eval_smape': 22.883644819735526,
 'eval_runtime': 4.1423,
 'eval_samples_per_second': 241.895,
 'eval_steps_per_second': 12.312,
 'epoch': 10.0}

# Fourth Model (trying FLAML)

In [73]:
from transformers import TrainingArguments, Trainer

In [74]:
MODEL_CHECKPOINT = "distilbert-base-uncased"

In [75]:
#Download the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=1)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

In [None]:
#Define training arguments 
args = TrainingArguments(
    output_dir='output',
    do_eval=True,
)

In [None]:
#Define the trainer 
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_for_regression,
)

In [None]:
#Train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1701
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 639
  Number of trainable parameters = 66954241
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.0186


Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
tokenizer config file saved in output/checkpoint-500/tokenizer_config.json
Special tokens file saved in output/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=639, training_loss=0.016356775271873892, metrics={'train_runtime': 74.3096, 'train_samples_per_second': 68.672, 'train_steps_per_second': 8.599, 'total_flos': 675969080177664.0, 'train_loss': 0.016356775271873892, 'epoch': 3.0})

In [None]:
#Evaluate the model
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1002
  Batch size = 8


{'eval_loss': 0.019569557160139084,
 'eval_mse': 0.019569557160139084,
 'eval_rmse': 0.13989123702049255,
 'eval_mae': 0.11179779469966888,
 'eval_r2': 0.45469886176708085,
 'eval_smape': 23.760578452470057,
 'eval_runtime': 4.3975,
 'eval_samples_per_second': 227.854,
 'eval_steps_per_second': 28.652,
 'epoch': 3.0}

In [None]:
#Install flaml
!pip install flaml

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#Import the flaml
import flaml

In [None]:
#Define the function for the ray tuning 

def train_distilbert(config: dict):

    

    train_dataset, eval_dataset = tokenized_train_dataset, tokenized_val_dataset #Pass the train and evaluation sets for tuning

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT, num_labels=1
    )

    def compute_metrics_for_regression(eval_pred): #Function to compute the metrics for evaluation 
        logits, labels = eval_pred
        labels = labels.reshape(-1, 1)

        mse = mean_squared_error(labels, logits)
        rmse = mean_squared_error(labels, logits, squared=False)
        mae = mean_absolute_error(labels, logits)
        r2 = r2_score(labels, logits)
        smape = 1/len(labels) * np.sum(2 * np.abs(logits-labels) / (np.abs(labels) + np.abs(logits))*100)

        return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}

    #Training args 
    training_args = TrainingArguments(
        output_dir='.',
        do_eval=False,
        disable_tqdm=True,
        logging_steps=20000,
        save_total_limit=0,
        **config,
    )
    #Trainer
    trainer = Trainer(
        model,
        training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_for_regression,
    )

    # train model
    trainer.train()

    # evaluate model
    eval_output = trainer.evaluate()

    # report the metric to optimize
    flaml.tune.report(
        r2=eval_output["eval_r2"],
    )

In [None]:
#Define the search space for ray tuning 
max_num_epoch = 64
search_space = {
        # You can mix constants with search space objects.
        "num_train_epochs": flaml.tune.loguniform(1, max_num_epoch),
        "learning_rate": flaml.tune.loguniform(1e-6, 1e-4),
        "adam_epsilon": flaml.tune.loguniform(1e-9, 1e-7),
        "adam_beta1": flaml.tune.uniform(0.8, 0.99),
        "adam_beta2": flaml.tune.loguniform(98e-2, 9999e-4),
}

In [None]:
# optimization objective
HP_METRIC, MODE = "r2", "max"

# resources
num_cpus = 0
num_gpus = 1

# constraints
num_samples = -1    # number of trials, -1 means unlimited
time_budget_s = 3600    # time budget in seconds

In [None]:
!pip install pickle5

In [None]:
import pickle5 as pickle

In [None]:
#Ray tuning 
import time
import ray
start_time = time.time()
ray.shutdown()
ray.init(num_cpus=num_cpus, num_gpus=num_gpus)

print("Tuning started...")
analysis = flaml.tune.run(
    train_distilbert,
    search_alg=flaml.CFO(
        space=search_space,
        metric=HP_METRIC,
        mode=MODE,
        low_cost_partial_config={"num_train_epochs": 1}),
    # uncomment the following if scheduler = 'asha',
    # max_resource=max_num_epoch, min_resource=1,
    resources_per_trial={"gpu": num_gpus, "cpu": num_cpus},
    local_dir='logs/',
    num_samples=num_samples,
    time_budget_s=time_budget_s,
    use_ray=True,
)

ray.shutdown()

Tuning started...


[2m[36m(train_distilbert pid=7468)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=7468)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=7468)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=7468)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=7468)[0m 
[2m[36m(train_distilbert pid=7468)[0m 
[2m[36m(train_distilbert pid=7468)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=7468)[0m 
[2m[36m(train_distilbert pid=7468)[0m 
[2m[36m(train_distilbert pid=7468)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=7468)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=7468)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=7468)[0m   Batch size = 8


[2m[36m(train_distilbert pid=7468)[0m {'train_runtime': 23.43, 'train_samples_per_second': 72.599, 'train_steps_per_second': 9.091, 'train_loss': 0.06773512576107688, 'epoch': 1.0}


Trial train_distilbert_75886418 reported r2=0.04 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1.5662610420278344e-06, 'adam_epsilon': 1.9702991167906198e-08, 'adam_beta1': 0.8689337534108345, 'adam_beta2': 0.9872898093714128}.
Trial train_distilbert_75886418 completed. Last result: r2=0.03501930286920463
[2m[36m(train_distilbert pid=7468)[0m {'eval_loss': 0.03463085740804672, 'eval_mse': 0.03463085740804672, 'eval_rmse': 0.18609367311000824, 'eval_mae': 0.15341486036777496, 'eval_r2': 0.03501930286920463, 'eval_smape': 31.362470371756483, 'eval_runtime': 4.2255, 'eval_samples_per_second': 237.131, 'eval_steps_per_second': 29.819, 'epoch': 1.0}


[2m[36m(train_distilbert pid=7558)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
[2m[36m(train_distilbert pid=7558)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=7558)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=7558)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=7558)[0m {'train_runtime': 33.4159, 'train_samples_per_second': 73.519, 'train_steps_per_second': 9.217, 'train_loss': 0.023007295348427513, 'epoch': 1.45}


[2m[36m(train_distilbert pid=7558)[0m 
[2m[36m(train_distilbert pid=7558)[0m 
[2m[36m(train_distilbert pid=7558)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=7558)[0m 
[2m[36m(train_distilbert pid=7558)[0m 
[2m[36m(train_distilbert pid=7558)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=7558)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=7558)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=7558)[0m   Batch size = 8


Trial train_distilbert_797cc10e reported r2=0.42 with parameters={'num_train_epochs': 1.444265389543504, 'learning_rate': 3.2432004833442285e-05, 'adam_epsilon': 3.4102391893542775e-09, 'adam_beta1': 0.943680029695067, 'adam_beta2': 0.9802681933780513}.
Trial train_distilbert_797cc10e completed. Last result: r2=0.42094831344643224
[2m[36m(train_distilbert pid=7558)[0m {'eval_loss': 0.020780785009264946, 'eval_mse': 0.020780785009264946, 'eval_rmse': 0.14415541291236877, 'eval_mae': 0.11464690417051315, 'eval_r2': 0.42094831344643224, 'eval_smape': 24.019180389221553, 'eval_runtime': 4.2181, 'eval_samples_per_second': 237.548, 'eval_steps_per_second': 29.871, 'epoch': 1.45}


[2m[36m(train_distilbert pid=7656)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=7656)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=7656)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=7656)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=7656)[0m {'train_runtime': 23.5523, 'train_samples_per_second': 72.222, 'train_steps_per_second': 9.044, 'train_loss': 0.0513407568416685, 'epoch': 1.0}


[2m[36m(train_distilbert pid=7656)[0m 
[2m[36m(train_distilbert pid=7656)[0m 
[2m[36m(train_distilbert pid=7656)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=7656)[0m 
[2m[36m(train_distilbert pid=7656)[0m 
[2m[36m(train_distilbert pid=7656)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=7656)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=7656)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=7656)[0m   Batch size = 8


Trial train_distilbert_8c1c4f3c reported r2=0.01 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1.469662593007818e-06, 'adam_epsilon': 3.577276599136122e-08, 'adam_beta1': 0.839249940775602, 'adam_beta2': 0.9891306099839907}.
Trial train_distilbert_8c1c4f3c completed. Last result: r2=0.005012244112677311
[2m[36m(train_distilbert pid=7656)[0m {'eval_loss': 0.0357077419757843, 'eval_mse': 0.0357077419757843, 'eval_rmse': 0.18896491825580597, 'eval_mae': 0.15632592141628265, 'eval_r2': 0.005012244112677311, 'eval_smape': 31.801034649451093, 'eval_runtime': 4.2213, 'eval_samples_per_second': 237.366, 'eval_steps_per_second': 29.848, 'epoch': 1.0}


[2m[36m(train_distilbert pid=7747)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
[2m[36m(train_distilbert pid=7747)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=7747)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=7747)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=7747)[0m 
[2m[36m(train_distilbert pid=7747)[0m 
[2m[36m(train_distilbert pid=7747)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=7747)[0m 
[2m[36m(train_distilbert pid=7747)[0m 
[2m[36m(train_distilbert pid=7747)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=7747)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=7747)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=7747)[0m   Batch size = 8


[2m[36m(train_distilbert pid=7747)[0m {'train_runtime': 24.3183, 'train_samples_per_second': 71.852, 'train_steps_per_second': 9.006, 'train_loss': 0.04933997376324379, 'epoch': 1.03}


Trial train_distilbert_a81fc376 reported r2=0.01 with parameters={'num_train_epochs': 1.027230096840913, 'learning_rate': 1.669208744541454e-06, 'adam_epsilon': 1.0852050441286482e-08, 'adam_beta1': 0.8986175660460669, 'adam_beta2': 0.9854524345419025}.
Trial train_distilbert_a81fc376 completed. Last result: r2=0.008964617345048276
[2m[36m(train_distilbert pid=7747)[0m {'eval_loss': 0.03556589409708977, 'eval_mse': 0.035565901547670364, 'eval_rmse': 0.18858924508094788, 'eval_mae': 0.156161829829216, 'eval_r2': 0.008964617345048276, 'eval_smape': 31.816616766467064, 'eval_runtime': 4.2488, 'eval_samples_per_second': 235.829, 'eval_steps_per_second': 29.655, 'epoch': 1.03}


[2m[36m(train_distilbert pid=7839)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=7839)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=7839)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=7839)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=7839)[0m {'train_runtime': 23.6504, 'train_samples_per_second': 71.923, 'train_steps_per_second': 9.006, 'train_loss': 0.05850101972409817, 'epoch': 1.0}


[2m[36m(train_distilbert pid=7839)[0m 
[2m[36m(train_distilbert pid=7839)[0m 
[2m[36m(train_distilbert pid=7839)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=7839)[0m 
[2m[36m(train_distilbert pid=7839)[0m 
[2m[36m(train_distilbert pid=7839)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=7839)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=7839)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=7839)[0m   Batch size = 8


[2m[36m(train_distilbert pid=7839)[0m {'eval_loss': 0.035196490585803986, 'eval_mse': 0.035196490585803986, 'eval_rmse': 0.1876072734594345, 'eval_mae': 0.15493591129779816, 'eval_r2': 0.0192580430217133, 'eval_smape': 31.622261336701595, 'eval_runtime': 4.235, 'eval_samples_per_second': 236.598, 'eval_steps_per_second': 29.752, 'epoch': 1.0}
Trial train_distilbert_be2eb96a reported r2=0.02 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1.4140209531361843e-06, 'adam_epsilon': 1.7536031374142777e-08, 'adam_beta1': 0.8920528630228418, 'adam_beta2': 0.9872627816010259}.
Trial train_distilbert_be2eb96a completed. Last result: r2=0.0192580430217133


[2m[36m(train_distilbert pid=7932)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias']
[2m[36m(train_distilbert pid=7932)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=7932)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=7932)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=7932)[0m {'train_runtime': 49.7684, 'train_samples_per_second': 73.635, 'train_steps_per_second': 9.223, 'train_loss': 0.03394764544917088, 'epoch': 2.15}


[2m[36m(train_distilbert pid=7932)[0m 
[2m[36m(train_distilbert pid=7932)[0m 
[2m[36m(train_distilbert pid=7932)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=7932)[0m 
[2m[36m(train_distilbert pid=7932)[0m 
[2m[36m(train_distilbert pid=7932)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=7932)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=7932)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=7932)[0m   Batch size = 8


Trial train_distilbert_d4d3bcce reported r2=0.10 with parameters={'num_train_epochs': 2.1544304289135847, 'learning_rate': 1.7348920087309716e-06, 'adam_epsilon': 2.2137726186724874e-08, 'adam_beta1': 0.8458146437988271, 'adam_beta2': 0.9873168378817246}.
Trial train_distilbert_d4d3bcce completed. Last result: r2=0.10487100922115222
[2m[36m(train_distilbert pid=7932)[0m {'eval_loss': 0.03212404623627663, 'eval_mse': 0.03212404623627663, 'eval_rmse': 0.17923182249069214, 'eval_mae': 0.14788718521595, 'eval_r2': 0.10487100922115222, 'eval_smape': 30.29617327844311, 'eval_runtime': 4.2226, 'eval_samples_per_second': 237.295, 'eval_steps_per_second': 29.839, 'epoch': 2.15}


[2m[36m(train_distilbert pid=8035)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=8035)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=8035)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=8035)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=8035)[0m {'train_runtime': 23.576, 'train_samples_per_second': 72.15, 'train_steps_per_second': 9.035, 'train_loss': 0.03856433277398768, 'epoch': 1.0}


[2m[36m(train_distilbert pid=8035)[0m 
[2m[36m(train_distilbert pid=8035)[0m 
[2m[36m(train_distilbert pid=8035)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=8035)[0m 
[2m[36m(train_distilbert pid=8035)[0m 
[2m[36m(train_distilbert pid=8035)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=8035)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=8035)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=8035)[0m   Batch size = 8


Trial train_distilbert_eae45e74 reported r2=0.05 with parameters={'num_train_epochs': 1.0, 'learning_rate': 2.8163427403178784e-06, 'adam_epsilon': 3.9207909595692414e-08, 'adam_beta1': 0.8567877260295178, 'adam_beta2': 0.9885643540287904}.
Trial train_distilbert_eae45e74 completed. Last result: r2=0.050992060985764365
[2m[36m(train_distilbert pid=8035)[0m {'eval_loss': 0.034057632088661194, 'eval_mse': 0.034057632088661194, 'eval_rmse': 0.18454709649085999, 'eval_mae': 0.15268754959106445, 'eval_r2': 0.050992060985764365, 'eval_smape': 31.21524529066866, 'eval_runtime': 4.2513, 'eval_samples_per_second': 235.692, 'eval_steps_per_second': 29.638, 'epoch': 1.0}


[2m[36m(train_distilbert pid=8128)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight']
[2m[36m(train_distilbert pid=8128)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=8128)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=8128)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=8128)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=8128)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=8128)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=8128)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=8128)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=8128)[0m {'train_runtime': 64.4414, 'train_samples_per_second': 72.044, 'train_steps_per_second': 9.031, 'train_loss': 0.047887562886136506, 'epoch': 2.73}


[2m[36m(train_distilbert pid=8128)[0m 
[2m[36m(train_distilbert pid=8128)[0m 
[2m[36m(train_distilbert pid=8128)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=8128)[0m 
[2m[36m(train_distilbert pid=8128)[0m 
[2m[36m(train_distilbert pid=8128)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=8128)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=8128)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=8128)[0m   Batch size = 8


Trial train_distilbert_110ae6f4 reported r2=0.05 with parameters={'num_train_epochs': 2.729346084540195, 'learning_rate': 1e-06, 'adam_epsilon': 1.1124781403354568e-08, 'adam_beta1': 0.8579606711801437, 'adam_beta2': 0.9860439016325745}.
Trial train_distilbert_110ae6f4 completed. Last result: r2=0.04718795246414509
[2m[36m(train_distilbert pid=8128)[0m {'eval_loss': 0.03419415280222893, 'eval_mse': 0.03419415280222893, 'eval_rmse': 0.18491661548614502, 'eval_mae': 0.15265889465808868, 'eval_r2': 0.04718795246414509, 'eval_smape': 31.18629537799401, 'eval_runtime': 4.2272, 'eval_samples_per_second': 237.037, 'eval_steps_per_second': 29.807, 'epoch': 2.73}


[2m[36m(train_distilbert pid=8238)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
[2m[36m(train_distilbert pid=8238)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=8238)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=8238)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=8238)[0m {'train_runtime': 37.3171, 'train_samples_per_second': 72.721, 'train_steps_per_second': 9.111, 'train_loss': 0.0407498696271111, 'epoch': 1.6}


[2m[36m(train_distilbert pid=8238)[0m 
[2m[36m(train_distilbert pid=8238)[0m 
[2m[36m(train_distilbert pid=8238)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=8238)[0m 
[2m[36m(train_distilbert pid=8238)[0m 
[2m[36m(train_distilbert pid=8238)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=8238)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=8238)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=8238)[0m   Batch size = 8


Trial train_distilbert_27172e26 reported r2=0.14 with parameters={'num_train_epochs': 1.5953752206236405, 'learning_rate': 1.957090243343021e-06, 'adam_epsilon': 2.6795533436066e-08, 'adam_beta1': 0.8067505833264412, 'adam_beta2': 0.98752163528273}.
Trial train_distilbert_27172e26 completed. Last result: r2=0.1434618096308884
[2m[36m(train_distilbert pid=8238)[0m {'eval_loss': 0.030739111825823784, 'eval_mse': 0.030739115551114082, 'eval_rmse': 0.17532573640346527, 'eval_mae': 0.1446247547864914, 'eval_r2': 0.1434618096308884, 'eval_smape': 29.724667851796404, 'eval_runtime': 4.2285, 'eval_samples_per_second': 236.964, 'eval_steps_per_second': 29.798, 'epoch': 1.6}


[2m[36m(train_distilbert pid=8336)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
[2m[36m(train_distilbert pid=8336)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=8336)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=8336)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=8336)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=8336)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=8336)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=8336)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=8336)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=8336)[0m 
[2m[36m(train_distilbert pid=8336)[0m 
[2m[36m(train_distilbert pid=8336)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=8336)[0m 
[2m[36m(train_distilbert pid=8336)[0m 
[2m[36m(train_distilbert pid=8336)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=8336)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=8336)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=8336)[0m   Batch size = 8


[2m[36m(train_distilbert pid=8336)[0m {'train_runtime': 68.6448, 'train_samples_per_second': 72.094, 'train_steps_per_second': 9.032, 'train_loss': 0.0353679318581858, 'epoch': 2.91}


Trial train_distilbert_55961c9e reported r2=0.18 with parameters={'num_train_epochs': 2.9093911031251696, 'learning_rate': 1.537921049985556e-06, 'adam_epsilon': 1.8289575084882357e-08, 'adam_beta1': 0.884878704271213, 'adam_beta2': 0.9871120829526754}.
Trial train_distilbert_55961c9e completed. Last result: r2=0.18138100340150298
[2m[36m(train_distilbert pid=8336)[0m {'eval_loss': 0.02937828190624714, 'eval_mse': 0.029378285631537437, 'eval_rmse': 0.17140094935894012, 'eval_mae': 0.14056496322155, 'eval_r2': 0.18138100340150298, 'eval_smape': 28.974623019585827, 'eval_runtime': 4.2261, 'eval_samples_per_second': 237.097, 'eval_steps_per_second': 29.815, 'epoch': 2.91}


[2m[36m(train_distilbert pid=8448)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
[2m[36m(train_distilbert pid=8448)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=8448)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=8448)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=8448)[0m {'train_runtime': 26.6784, 'train_samples_per_second': 72.398, 'train_steps_per_second': 9.071, 'train_loss': 0.04458569298105792, 'epoch': 1.14}


[2m[36m(train_distilbert pid=8448)[0m 
[2m[36m(train_distilbert pid=8448)[0m 
[2m[36m(train_distilbert pid=8448)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=8448)[0m 
[2m[36m(train_distilbert pid=8448)[0m 
[2m[36m(train_distilbert pid=8448)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=8448)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=8448)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=8448)[0m   Batch size = 8


Trial train_distilbert_776b24c2 reported r2=0.04 with parameters={'num_train_epochs': 1.1354839623640351, 'learning_rate': 1.377985982487315e-06, 'adam_epsilon': 2.2780932367076146e-08, 'adam_beta1': 0.8, 'adam_beta2': 0.9852672845181559}.
Trial train_distilbert_776b24c2 completed. Last result: r2=0.03594359562128924
[2m[36m(train_distilbert pid=8448)[0m {'eval_loss': 0.034597691148519516, 'eval_mse': 0.03459768742322922, 'eval_rmse': 0.18600453436374664, 'eval_mae': 0.1535484939813614, 'eval_r2': 0.03594359562128924, 'eval_smape': 31.332922046531934, 'eval_runtime': 4.2281, 'eval_samples_per_second': 236.985, 'eval_steps_per_second': 29.801, 'epoch': 1.14}


[2m[36m(train_distilbert pid=8543)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=8543)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=8543)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=8543)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=8543)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=8543)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=8543)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=8543)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=8543)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=8543)[0m 
[2m[36m(train_distilbert pid=8543)[0m 
[2m[36m(train_distilbert pid=8543)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=8543)[0m 
[2m[36m(train_distilbert pid=8543)[0m 
[2m[36m(train_distilbert pid=8543)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=8543)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=8543)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=8543)[0m   Batch size = 8


[2m[36m(train_distilbert pid=8543)[0m {'train_runtime': 95.4931, 'train_samples_per_second': 72.814, 'train_steps_per_second': 9.121, 'train_loss': 0.02664896486547319, 'epoch': 4.09}


Trial train_distilbert_a4dfcc14 reported r2=0.34 with parameters={'num_train_epochs': 4.087746394379009, 'learning_rate': 2.1842386789201557e-06, 'adam_epsilon': 2.1512680553263282e-08, 'adam_beta1': 0.9139079137302566, 'adam_beta2': 0.9893706547269459}.
Trial train_distilbert_a4dfcc14 completed. Last result: r2=0.335473216903762
[2m[36m(train_distilbert pid=8543)[0m {'eval_loss': 0.0238482803106308, 'eval_mse': 0.0238482803106308, 'eval_rmse': 0.1544288843870163, 'eval_mae': 0.12380921840667725, 'eval_r2': 0.335473216903762, 'eval_smape': 25.83918101297405, 'eval_runtime': 4.2508, 'eval_samples_per_second': 235.722, 'eval_steps_per_second': 29.642, 'epoch': 4.09}


[2m[36m(train_distilbert pid=8671)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
[2m[36m(train_distilbert pid=8671)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=8671)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=8671)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=8671)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=8671)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=8671)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=8671)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=8671)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=8671)[0m 
[2m[36m(train_distilbert pid=8671)[0m 
[2m[36m(train_distilbert pid=8671)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=8671)[0m 
[2m[36m(train_distilbert pid=8671)[0m 
[2m[36m(train_distilbert pid=8671)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=8671)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=8671)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=8671)[0m   Batch size = 8


[2m[36m(train_distilbert pid=8671)[0m {'train_runtime': 79.828, 'train_samples_per_second': 72.5, 'train_steps_per_second': 9.082, 'train_loss': 0.02333970694706358, 'epoch': 3.4}


Trial train_distilbert_c0bc9a20 reported r2=0.37 with parameters={'num_train_epochs': 3.402431917111274, 'learning_rate': 2.4896521452703187e-06, 'adam_epsilon': 3.4589864160784254e-08, 'adam_beta1': 0.8705621951532071, 'adam_beta2': 0.9893611688020619}.
Trial train_distilbert_c0bc9a20 completed. Last result: r2=0.37043232342432475
[2m[36m(train_distilbert pid=8671)[0m {'eval_loss': 0.022593682631850243, 'eval_mse': 0.022593682631850243, 'eval_rmse': 0.15031194686889648, 'eval_mae': 0.11841098964214325, 'eval_r2': 0.37043232342432475, 'eval_smape': 24.712570951846306, 'eval_runtime': 4.2283, 'eval_samples_per_second': 236.977, 'eval_steps_per_second': 29.8, 'epoch': 3.4}


[2m[36m(train_distilbert pid=8789)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
[2m[36m(train_distilbert pid=8789)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=8789)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=8789)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=8789)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=8789)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=8789)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=8789)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=8789)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=8789)[0m 
[2m[36m(train_distilbert pid=8789)[0m 
[2m[36m(train_distilbert pid=8789)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=8789)[0m 
[2m[36m(train_distilbert pid=8789)[0m 
[2m[36m(train_distilbert pid=8789)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=8789)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=8789)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=8789)[0m   Batch size = 8


[2m[36m(train_distilbert pid=8789)[0m {'train_runtime': 81.9187, 'train_samples_per_second': 72.58, 'train_steps_per_second': 9.094, 'train_loss': 0.0362544526989828, 'epoch': 3.5}


Trial train_distilbert_fe496102 reported r2=0.17 with parameters={'num_train_epochs': 3.495397786456084, 'learning_rate': 1.3492594332447271e-06, 'adam_epsilon': 1.1374944533667017e-08, 'adam_beta1': 0.9282244228482626, 'adam_beta2': 0.9871215473134833}.
Trial train_distilbert_fe496102 completed. Last result: r2=0.17017370490443307
[2m[36m(train_distilbert pid=8789)[0m {'eval_loss': 0.029780486598610878, 'eval_mse': 0.029780486598610878, 'eval_rmse': 0.17257024347782135, 'eval_mae': 0.14184731245040894, 'eval_r2': 0.17017370490443307, 'eval_smape': 29.214976297405187, 'eval_runtime': 4.2511, 'eval_samples_per_second': 235.705, 'eval_steps_per_second': 29.64, 'epoch': 3.5}


[2m[36m(train_distilbert pid=8898)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=8898)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=8898)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=8898)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=8898)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=8898)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=8898)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=8898)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=8898)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=8898)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=8898)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=8898)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=8898)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=8898)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=8898)[0m 
[2m[36m(train_distilbert pid=8898)[0m 
[2m[36m(train_distilbert pid=8898)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=8898)[0m 
[2m[36m(train_distilbert pid=8898)[0m 
[2m[36m(train_distilbert pid=8898)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=8898)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=8898)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=8898)[0m   Batch size = 8


[2m[36m(train_distilbert pid=8898)[0m {'train_runtime': 130.2627, 'train_samples_per_second': 72.278, 'train_steps_per_second': 9.051, 'train_loss': 0.02206240355513478, 'epoch': 5.54}


Trial train_distilbert_3650b596 reported r2=0.41 with parameters={'num_train_epochs': 5.535083701422448, 'learning_rate': 3.7328719738996334e-06, 'adam_epsilon': 2.149747874774826e-08, 'adam_beta1': 0.8457864945600663, 'adam_beta2': 0.9899590904200131}.
Trial train_distilbert_3650b596 completed. Last result: r2=0.4128429144005116
[2m[36m(train_distilbert pid=8898)[0m {'eval_loss': 0.021071668714284897, 'eval_mse': 0.021071668714284897, 'eval_rmse': 0.14516083896160126, 'eval_mae': 0.1149284690618515, 'eval_r2': 0.4128429144005116, 'eval_smape': 24.264864801646706, 'eval_runtime': 4.2565, 'eval_samples_per_second': 235.403, 'eval_steps_per_second': 29.602, 'epoch': 5.54}


[2m[36m(train_distilbert pid=9037)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
[2m[36m(train_distilbert pid=9037)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=9037)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=9037)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=9037)[0m 
[2m[36m(train_distilbert pid=9037)[0m 
[2m[36m(train_distilbert pid=9037)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=9037)[0m 
[2m[36m(train_distilbert pid=9037)[0m 
[2m[36m(train_distilbert pid=9037)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=9037)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=9037)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=9037)[0m   Batch size = 8


[2m[36m(train_distilbert pid=9037)[0m {'train_runtime': 48.5344, 'train_samples_per_second': 73.301, 'train_steps_per_second': 9.189, 'train_loss': 0.044266204662921714, 'epoch': 2.09}


Trial train_distilbert_72f93306 reported r2=0.07 with parameters={'num_train_epochs': 2.09148471370044, 'learning_rate': 1.6604822902548753e-06, 'adam_epsilon': 5.5655768599693614e-08, 'adam_beta1': 0.8953378957463479, 'adam_beta2': 0.98876360832051}.
Trial train_distilbert_72f93306 completed. Last result: r2=0.06657254675072433
[2m[36m(train_distilbert pid=9037)[0m {'eval_loss': 0.03349848464131355, 'eval_mse': 0.03349848464131355, 'eval_rmse': 0.18302591145038605, 'eval_mae': 0.15085244178771973, 'eval_r2': 0.06657254675072433, 'eval_smape': 30.85793062312874, 'eval_runtime': 4.2639, 'eval_samples_per_second': 234.995, 'eval_steps_per_second': 29.55, 'epoch': 2.09}


[2m[36m(train_distilbert pid=9140)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=9140)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=9140)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=9140)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=9140)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=9140)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=9140)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=9140)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=9140)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=9140)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=9140)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=9140)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=9140)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=9140)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=9140)[0m Saving model checkpoint to ./checkpoint-1500
[2m[36m(train_distilbert pid=9140)[0m Configuration saved in ./checkpoint-1500/config.json
[2m[36m(train_distilbert pid=9140)[0m Model weights saved in ./checkpoint-1500/pytorch_model.bin
[2m[36m(train_distilbert pid=9140)[0m tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
[2m[36m(train_distilbert pid=9140)[0m Special tokens file saved in ./checkpoint-1500/special_tokens_map.json


[2m[36m(train_distilbert pid=9140)[0m Saving model checkpoint to ./checkpoint-2000
[2m[36m(train_distilbert pid=9140)[0m Configuration saved in ./checkpoint-2000/config.json
[2m[36m(train_distilbert pid=9140)[0m Model weights saved in ./checkpoint-2000/pytorch_model.bin
[2m[36m(train_distilbert pid=9140)[0m tokenizer config file saved in ./checkpoint-2000/tokenizer_config.json
[2m[36m(train_distilbert pid=9140)[0m Special tokens file saved in ./checkpoint-2000/special_tokens_map.json


[2m[36m(train_distilbert pid=9140)[0m {'train_runtime': 266.6454, 'train_samples_per_second': 72.712, 'train_steps_per_second': 9.106, 'train_loss': 0.014392284035093506, 'epoch': 11.4}


[2m[36m(train_distilbert pid=9140)[0m 
[2m[36m(train_distilbert pid=9140)[0m 
[2m[36m(train_distilbert pid=9140)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=9140)[0m 
[2m[36m(train_distilbert pid=9140)[0m 
[2m[36m(train_distilbert pid=9140)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=9140)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=9140)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=9140)[0m   Batch size = 8


Trial train_distilbert_c5618134 reported r2=0.46 with parameters={'num_train_epochs': 11.398156531060607, 'learning_rate': 3.6761194595330126e-06, 'adam_epsilon': 3.9653295739867804e-08, 'adam_beta1': 0.8522823266823952, 'adam_beta2': 0.9905817111380364}.
Trial train_distilbert_c5618134 completed. Last result: r2=0.4572298475992562
[2m[36m(train_distilbert pid=9140)[0m {'eval_loss': 0.019478727132081985, 'eval_mse': 0.019478727132081985, 'eval_rmse': 0.13956621289253235, 'eval_mae': 0.11009307950735092, 'eval_r2': 0.4572298475992562, 'eval_smape': 23.430552956586823, 'eval_runtime': 4.26, 'eval_samples_per_second': 235.213, 'eval_steps_per_second': 29.578, 'epoch': 11.4}


[2m[36m(train_distilbert pid=9334)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight']
[2m[36m(train_distilbert pid=9334)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=9334)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=9334)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=9334)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=9334)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=9334)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=9334)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=9334)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=9334)[0m {'train_runtime': 63.7253, 'train_samples_per_second': 71.747, 'train_steps_per_second': 8.992, 'train_loss': 0.02822744492668964, 'epoch': 2.69}


[2m[36m(train_distilbert pid=9334)[0m 
[2m[36m(train_distilbert pid=9334)[0m 
[2m[36m(train_distilbert pid=9334)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=9334)[0m 
[2m[36m(train_distilbert pid=9334)[0m 
[2m[36m(train_distilbert pid=9334)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=9334)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=9334)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=9334)[0m   Batch size = 8


Trial train_distilbert_eaeec7a4 reported r2=0.25 with parameters={'num_train_epochs': 2.6879040920577384, 'learning_rate': 3.790500642570375e-06, 'adam_epsilon': 1.1654556926153477e-08, 'adam_beta1': 0.8392906624377373, 'adam_beta2': 0.9893368610443235}.
Trial train_distilbert_eaeec7a4 completed. Last result: r2=0.25332924435949644
[2m[36m(train_distilbert pid=9334)[0m {'eval_loss': 0.026796234771609306, 'eval_mse': 0.026796234771609306, 'eval_rmse': 0.1636955589056015, 'eval_mae': 0.13194753229618073, 'eval_r2': 0.25332924435949644, 'eval_smape': 27.356332257360275, 'eval_runtime': 4.2606, 'eval_samples_per_second': 235.179, 'eval_steps_per_second': 29.573, 'epoch': 2.69}


[2m[36m(train_distilbert pid=9431)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
[2m[36m(train_distilbert pid=9431)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=9431)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=9431)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=9431)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=9431)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=9431)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=9431)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=9431)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=9431)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=9431)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=9431)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=9431)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=9431)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=9431)[0m Saving model checkpoint to ./checkpoint-1500
[2m[36m(train_distilbert pid=9431)[0m Configuration saved in ./checkpoint-1500/config.json
[2m[36m(train_distilbert pid=9431)[0m Model weights saved in ./checkpoint-1500/pytorch_model.bin
[2m[36m(train_distilbert pid=9431)[0m tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
[2m[36m(train_distilbert pid=9431)[0m Special tokens file saved in ./checkpoint-1500/special_tokens_map.json


[2m[36m(train_distilbert pid=9431)[0m Saving model checkpoint to ./checkpoint-2000
[2m[36m(train_distilbert pid=9431)[0m Configuration saved in ./checkpoint-2000/config.json
[2m[36m(train_distilbert pid=9431)[0m Model weights saved in ./checkpoint-2000/pytorch_model.bin
[2m[36m(train_distilbert pid=9431)[0m tokenizer config file saved in ./checkpoint-2000/tokenizer_config.json
[2m[36m(train_distilbert pid=9431)[0m Special tokens file saved in ./checkpoint-2000/special_tokens_map.json


[2m[36m(train_distilbert pid=9431)[0m Saving model checkpoint to ./checkpoint-2500
[2m[36m(train_distilbert pid=9431)[0m Configuration saved in ./checkpoint-2500/config.json
[2m[36m(train_distilbert pid=9431)[0m Model weights saved in ./checkpoint-2500/pytorch_model.bin
[2m[36m(train_distilbert pid=9431)[0m tokenizer config file saved in ./checkpoint-2500/tokenizer_config.json
[2m[36m(train_distilbert pid=9431)[0m Special tokens file saved in ./checkpoint-2500/special_tokens_map.json


[2m[36m(train_distilbert pid=9431)[0m 
[2m[36m(train_distilbert pid=9431)[0m 
[2m[36m(train_distilbert pid=9431)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=9431)[0m 
[2m[36m(train_distilbert pid=9431)[0m 
[2m[36m(train_distilbert pid=9431)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=9431)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=9431)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=9431)[0m   Batch size = 8


[2m[36m(train_distilbert pid=9431)[0m {'train_runtime': 303.6434, 'train_samples_per_second': 72.649, 'train_steps_per_second': 9.099, 'train_loss': 0.01431727538797418, 'epoch': 12.97}


Trial train_distilbert_926f0a98 reported r2=0.45 with parameters={'num_train_epochs': 12.968568022421785, 'learning_rate': 4.088618327523183e-06, 'adam_epsilon': 3.233810018632289e-08, 'adam_beta1': 0.8889663281412874, 'adam_beta2': 0.9925003306754846}.
Trial train_distilbert_926f0a98 completed. Last result: r2=0.45110030223499675
[2m[36m(train_distilbert pid=9431)[0m {'eval_loss': 0.019698701798915863, 'eval_mse': 0.019698699936270714, 'eval_rmse': 0.1403520554304123, 'eval_mae': 0.11162743717432022, 'eval_r2': 0.45110030223499675, 'eval_smape': 23.894749563373253, 'eval_runtime': 4.2447, 'eval_samples_per_second': 236.059, 'eval_steps_per_second': 29.684, 'epoch': 12.97}


[2m[36m(train_distilbert pid=9653)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight']
[2m[36m(train_distilbert pid=9653)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=9653)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=9653)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=9653)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=9653)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=9653)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=9653)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=9653)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=9653)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=9653)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=9653)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=9653)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=9653)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=9653)[0m Saving model checkpoint to ./checkpoint-1500
[2m[36m(train_distilbert pid=9653)[0m Configuration saved in ./checkpoint-1500/config.json
[2m[36m(train_distilbert pid=9653)[0m Model weights saved in ./checkpoint-1500/pytorch_model.bin
[2m[36m(train_distilbert pid=9653)[0m tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
[2m[36m(train_distilbert pid=9653)[0m Special tokens file saved in ./checkpoint-1500/special_tokens_map.json


[2m[36m(train_distilbert pid=9653)[0m Saving model checkpoint to ./checkpoint-2000
[2m[36m(train_distilbert pid=9653)[0m Configuration saved in ./checkpoint-2000/config.json
[2m[36m(train_distilbert pid=9653)[0m Model weights saved in ./checkpoint-2000/pytorch_model.bin
[2m[36m(train_distilbert pid=9653)[0m tokenizer config file saved in ./checkpoint-2000/tokenizer_config.json
[2m[36m(train_distilbert pid=9653)[0m Special tokens file saved in ./checkpoint-2000/special_tokens_map.json


[2m[36m(train_distilbert pid=9653)[0m 
[2m[36m(train_distilbert pid=9653)[0m 
[2m[36m(train_distilbert pid=9653)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=9653)[0m 
[2m[36m(train_distilbert pid=9653)[0m 
[2m[36m(train_distilbert pid=9653)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=9653)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=9653)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=9653)[0m   Batch size = 8


[2m[36m(train_distilbert pid=9653)[0m {'train_runtime': 235.2822, 'train_samples_per_second': 72.426, 'train_steps_per_second': 9.07, 'train_loss': 0.017194839091421627, 'epoch': 10.02}


Trial train_distilbert_c0ecec6e reported r2=0.41 with parameters={'num_train_epochs': 10.017911929978716, 'learning_rate': 3.305237417194613e-06, 'adam_epsilon': 4.8623260302052156e-08, 'adam_beta1': 0.8155983252235031, 'adam_beta2': 0.9886668005171655}.
Trial train_distilbert_c0ecec6e completed. Last result: r2=0.40581959799049294
[2m[36m(train_distilbert pid=9653)[0m {'eval_loss': 0.02132371813058853, 'eval_mse': 0.02132371813058853, 'eval_rmse': 0.14602643251419067, 'eval_mae': 0.1161208227276802, 'eval_r2': 0.40581959799049294, 'eval_smape': 24.56076128992016, 'eval_runtime': 4.2516, 'eval_samples_per_second': 235.675, 'eval_steps_per_second': 29.636, 'epoch': 10.02}


[2m[36m(train_distilbert pid=9844)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight']
[2m[36m(train_distilbert pid=9844)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=9844)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=9844)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=9844)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=9844)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=9844)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=9844)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=9844)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=9844)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=9844)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=9844)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=9844)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=9844)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=9844)[0m Saving model checkpoint to ./checkpoint-1500
[2m[36m(train_distilbert pid=9844)[0m Configuration saved in ./checkpoint-1500/config.json
[2m[36m(train_distilbert pid=9844)[0m Model weights saved in ./checkpoint-1500/pytorch_model.bin
[2m[36m(train_distilbert pid=9844)[0m tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
[2m[36m(train_distilbert pid=9844)[0m Special tokens file saved in ./checkpoint-1500/special_tokens_map.json


[2m[36m(train_distilbert pid=9844)[0m 
[2m[36m(train_distilbert pid=9844)[0m 
[2m[36m(train_distilbert pid=9844)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=9844)[0m 
[2m[36m(train_distilbert pid=9844)[0m 
[2m[36m(train_distilbert pid=9844)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=9844)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=9844)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=9844)[0m   Batch size = 8


[2m[36m(train_distilbert pid=9844)[0m {'train_runtime': 203.8052, 'train_samples_per_second': 72.759, 'train_steps_per_second': 9.112, 'train_loss': 0.01803072845672624, 'epoch': 8.72}


Trial train_distilbert_7de50946 reported r2=0.42 with parameters={'num_train_epochs': 8.717630602766743, 'learning_rate': 3.1584641211444963e-06, 'adam_epsilon': 2.278501307223444e-08, 'adam_beta1': 0.8628283030522914, 'adam_beta2': 0.9873028599648658}.
Trial train_distilbert_7de50946 completed. Last result: r2=0.4225469945997752
[2m[36m(train_distilbert pid=9844)[0m {'eval_loss': 0.020723411813378334, 'eval_mse': 0.020723411813378334, 'eval_rmse': 0.1439562886953354, 'eval_mae': 0.11334699392318726, 'eval_r2': 0.4225469945997752, 'eval_smape': 23.86851492327844, 'eval_runtime': 4.2429, 'eval_samples_per_second': 236.159, 'eval_steps_per_second': 29.697, 'epoch': 8.72}


[2m[36m(train_distilbert pid=10020)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=10020)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=10020)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=10020)[0m Some weights of DistilBertForSequenceClas

[2m[36m(train_distilbert pid=10020)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=10020)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=10020)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=10020)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=10020)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=10020)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=10020)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=10020)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=10020)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=10020)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=10020)[0m Saving model checkpoint to ./checkpoint-1500
[2m[36m(train_distilbert pid=10020)[0m Configuration saved in ./checkpoint-1500/config.json
[2m[36m(train_distilbert pid=10020)[0m Model weights saved in ./checkpoint-1500/pytorch_model.bin
[2m[36m(train_distilbert pid=10020)[0m tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
[2m[36m(train_distilbert pid=10020)[0m Special tokens file saved in ./checkpoint-1500/special_tokens_map.json


[2m[36m(train_distilbert pid=10020)[0m Saving model checkpoint to ./checkpoint-2000
[2m[36m(train_distilbert pid=10020)[0m Configuration saved in ./checkpoint-2000/config.json
[2m[36m(train_distilbert pid=10020)[0m Model weights saved in ./checkpoint-2000/pytorch_model.bin
[2m[36m(train_distilbert pid=10020)[0m tokenizer config file saved in ./checkpoint-2000/tokenizer_config.json
[2m[36m(train_distilbert pid=10020)[0m Special tokens file saved in ./checkpoint-2000/special_tokens_map.json


[2m[36m(train_distilbert pid=10020)[0m Saving model checkpoint to ./checkpoint-2500
[2m[36m(train_distilbert pid=10020)[0m Configuration saved in ./checkpoint-2500/config.json
[2m[36m(train_distilbert pid=10020)[0m Model weights saved in ./checkpoint-2500/pytorch_model.bin
[2m[36m(train_distilbert pid=10020)[0m tokenizer config file saved in ./checkpoint-2500/tokenizer_config.json
[2m[36m(train_distilbert pid=10020)[0m Special tokens file saved in ./checkpoint-2500/special_tokens_map.json


[2m[36m(train_distilbert pid=10020)[0m Saving model checkpoint to ./checkpoint-3000
[2m[36m(train_distilbert pid=10020)[0m Configuration saved in ./checkpoint-3000/config.json
[2m[36m(train_distilbert pid=10020)[0m Model weights saved in ./checkpoint-3000/pytorch_model.bin
[2m[36m(train_distilbert pid=10020)[0m tokenizer config file saved in ./checkpoint-3000/tokenizer_config.json
[2m[36m(train_distilbert pid=10020)[0m Special tokens file saved in ./checkpoint-3000/special_tokens_map.json


[2m[36m(train_distilbert pid=10020)[0m {'train_runtime': 349.3371, 'train_samples_per_second': 72.566, 'train_steps_per_second': 9.089, 'train_loss': 0.012158908393439345, 'epoch': 14.91}


[2m[36m(train_distilbert pid=10020)[0m 
[2m[36m(train_distilbert pid=10020)[0m 
[2m[36m(train_distilbert pid=10020)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=10020)[0m 
[2m[36m(train_distilbert pid=10020)[0m 
[2m[36m(train_distilbert pid=10020)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=10020)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=10020)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=10020)[0m   Batch size = 8


Trial train_distilbert_1250f7d4 reported r2=0.46 with parameters={'num_train_epochs': 14.90289944900019, 'learning_rate': 4.278615732972273e-06, 'adam_epsilon': 6.900956598306757e-08, 'adam_beta1': 0.841736350312499, 'adam_beta2': 0.9938714514369775}.
Trial train_distilbert_1250f7d4 completed. Last result: r2=0.46146595422366776
[2m[36m(train_distilbert pid=10020)[0m {'eval_loss': 0.019326703622937202, 'eval_mse': 0.019326703622937202, 'eval_rmse': 0.13902051746845245, 'eval_mae': 0.11033818870782852, 'eval_r2': 0.46146595422366776, 'eval_smape': 23.62718508295908, 'eval_runtime': 4.2322, 'eval_samples_per_second': 236.758, 'eval_steps_per_second': 29.772, 'epoch': 14.91}


[2m[36m(train_distilbert pid=10262)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias']
[2m[36m(train_distilbert pid=10262)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=10262)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=10262)[0m Some weights of DistilBertForSequenceClas

[2m[36m(train_distilbert pid=10262)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=10262)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=10262)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=10262)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=10262)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=10262)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=10262)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=10262)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=10262)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=10262)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=10262)[0m Saving model checkpoint to ./checkpoint-1500
[2m[36m(train_distilbert pid=10262)[0m Configuration saved in ./checkpoint-1500/config.json
[2m[36m(train_distilbert pid=10262)[0m Model weights saved in ./checkpoint-1500/pytorch_model.bin
[2m[36m(train_distilbert pid=10262)[0m tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
[2m[36m(train_distilbert pid=10262)[0m Special tokens file saved in ./checkpoint-1500/special_tokens_map.json


[2m[36m(train_distilbert pid=10262)[0m Saving model checkpoint to ./checkpoint-2000
[2m[36m(train_distilbert pid=10262)[0m Configuration saved in ./checkpoint-2000/config.json
[2m[36m(train_distilbert pid=10262)[0m Model weights saved in ./checkpoint-2000/pytorch_model.bin
[2m[36m(train_distilbert pid=10262)[0m tokenizer config file saved in ./checkpoint-2000/tokenizer_config.json
[2m[36m(train_distilbert pid=10262)[0m Special tokens file saved in ./checkpoint-2000/special_tokens_map.json


[2m[36m(train_distilbert pid=10262)[0m {'train_runtime': 221.3271, 'train_samples_per_second': 72.56, 'train_steps_per_second': 9.086, 'train_loss': 0.015231481065565173, 'epoch': 9.44}


[2m[36m(train_distilbert pid=10262)[0m 
[2m[36m(train_distilbert pid=10262)[0m 
[2m[36m(train_distilbert pid=10262)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=10262)[0m 
[2m[36m(train_distilbert pid=10262)[0m 
[2m[36m(train_distilbert pid=10262)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=10262)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=10262)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=10262)[0m   Batch size = 8


Trial train_distilbert_944cf184 reported r2=0.43 with parameters={'num_train_epochs': 9.441201639947518, 'learning_rate': 5.86837067851563e-06, 'adam_epsilon': 5.630627597473846e-08, 'adam_beta1': 0.8670967561056424, 'adam_beta2': 0.9873909813092763}.
Trial train_distilbert_944cf184 completed. Last result: r2=0.4279118856069456
[2m[36m(train_distilbert pid=10262)[0m {'eval_loss': 0.020530879497528076, 'eval_mse': 0.020530879497528076, 'eval_rmse': 0.14328600466251373, 'eval_mae': 0.11427132040262222, 'eval_r2': 0.4279118856069456, 'eval_smape': 24.2191573883483, 'eval_runtime': 4.2259, 'eval_samples_per_second': 237.11, 'eval_steps_per_second': 29.816, 'epoch': 9.44}


[2m[36m(train_distilbert pid=10447)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
[2m[36m(train_distilbert pid=10447)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=10447)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=10447)[0m Some weights of DistilBertForSequenceClas

[2m[36m(train_distilbert pid=10447)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=10447)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=10447)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=10447)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=10447)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=10447)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=10447)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=10447)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=10447)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=10447)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=10447)[0m Saving model checkpoint to ./checkpoint-1500
[2m[36m(train_distilbert pid=10447)[0m Configuration saved in ./checkpoint-1500/config.json
[2m[36m(train_distilbert pid=10447)[0m Model weights saved in ./checkpoint-1500/pytorch_model.bin
[2m[36m(train_distilbert pid=10447)[0m tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
[2m[36m(train_distilbert pid=10447)[0m Special tokens file saved in ./checkpoint-1500/special_tokens_map.json


[2m[36m(train_distilbert pid=10447)[0m Saving model checkpoint to ./checkpoint-2000
[2m[36m(train_distilbert pid=10447)[0m Configuration saved in ./checkpoint-2000/config.json
[2m[36m(train_distilbert pid=10447)[0m Model weights saved in ./checkpoint-2000/pytorch_model.bin
[2m[36m(train_distilbert pid=10447)[0m tokenizer config file saved in ./checkpoint-2000/tokenizer_config.json
[2m[36m(train_distilbert pid=10447)[0m Special tokens file saved in ./checkpoint-2000/special_tokens_map.json


[2m[36m(train_distilbert pid=10447)[0m Saving model checkpoint to ./checkpoint-2500
[2m[36m(train_distilbert pid=10447)[0m Configuration saved in ./checkpoint-2500/config.json
[2m[36m(train_distilbert pid=10447)[0m Model weights saved in ./checkpoint-2500/pytorch_model.bin
[2m[36m(train_distilbert pid=10447)[0m tokenizer config file saved in ./checkpoint-2500/tokenizer_config.json
[2m[36m(train_distilbert pid=10447)[0m Special tokens file saved in ./checkpoint-2500/special_tokens_map.json


[2m[36m(train_distilbert pid=10447)[0m Saving model checkpoint to ./checkpoint-3000
[2m[36m(train_distilbert pid=10447)[0m Configuration saved in ./checkpoint-3000/config.json
[2m[36m(train_distilbert pid=10447)[0m Model weights saved in ./checkpoint-3000/pytorch_model.bin
[2m[36m(train_distilbert pid=10447)[0m tokenizer config file saved in ./checkpoint-3000/tokenizer_config.json
[2m[36m(train_distilbert pid=10447)[0m Special tokens file saved in ./checkpoint-3000/special_tokens_map.json


[2m[36m(train_distilbert pid=10447)[0m Saving model checkpoint to ./checkpoint-3500
[2m[36m(train_distilbert pid=10447)[0m Configuration saved in ./checkpoint-3500/config.json
[2m[36m(train_distilbert pid=10447)[0m Model weights saved in ./checkpoint-3500/pytorch_model.bin
[2m[36m(train_distilbert pid=10447)[0m tokenizer config file saved in ./checkpoint-3500/tokenizer_config.json
[2m[36m(train_distilbert pid=10447)[0m Special tokens file saved in ./checkpoint-3500/special_tokens_map.json


[2m[36m(train_distilbert pid=10447)[0m {'train_runtime': 420.9361, 'train_samples_per_second': 72.705, 'train_steps_per_second': 9.106, 'train_loss': 0.013060322200975187, 'epoch': 18.0}


[2m[36m(train_distilbert pid=10447)[0m 
[2m[36m(train_distilbert pid=10447)[0m 
[2m[36m(train_distilbert pid=10447)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=10447)[0m 
[2m[36m(train_distilbert pid=10447)[0m 
[2m[36m(train_distilbert pid=10447)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=10447)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=10447)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=10447)[0m   Batch size = 8


Trial train_distilbert_6ca717c6 reported r2=0.45 with parameters={'num_train_epochs': 17.991945004926876, 'learning_rate': 2.6802503484360588e-06, 'adam_epsilon': 4.859949768360106e-08, 'adam_beta1': 0.8269219208892519, 'adam_beta2': 0.9970831227466022}.
Trial train_distilbert_6ca717c6 completed. Last result: r2=0.45012298753805813
[2m[36m(train_distilbert pid=10447)[0m {'eval_loss': 0.019733775407075882, 'eval_mse': 0.019733775407075882, 'eval_rmse': 0.14047695696353912, 'eval_mae': 0.11049866676330566, 'eval_r2': 0.45012298753805813, 'eval_smape': 23.525014424276446, 'eval_runtime': 4.2267, 'eval_samples_per_second': 237.064, 'eval_steps_per_second': 29.81, 'epoch': 18.0}


[2m[36m(train_distilbert pid=10665)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=10665)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=10665)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=10665)[0m Some weights of DistilBertForSequenceClas

[2m[36m(train_distilbert pid=10665)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=10665)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=10665)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=10665)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=10665)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=10665)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=10665)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=10665)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=10665)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=10665)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=10665)[0m Saving model checkpoint to ./checkpoint-1500
[2m[36m(train_distilbert pid=10665)[0m Configuration saved in ./checkpoint-1500/config.json
[2m[36m(train_distilbert pid=10665)[0m Model weights saved in ./checkpoint-1500/pytorch_model.bin
[2m[36m(train_distilbert pid=10665)[0m tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
[2m[36m(train_distilbert pid=10665)[0m Special tokens file saved in ./checkpoint-1500/special_tokens_map.json


[2m[36m(train_distilbert pid=10665)[0m Saving model checkpoint to ./checkpoint-2000
[2m[36m(train_distilbert pid=10665)[0m Configuration saved in ./checkpoint-2000/config.json
[2m[36m(train_distilbert pid=10665)[0m Model weights saved in ./checkpoint-2000/pytorch_model.bin
[2m[36m(train_distilbert pid=10665)[0m tokenizer config file saved in ./checkpoint-2000/tokenizer_config.json
[2m[36m(train_distilbert pid=10665)[0m Special tokens file saved in ./checkpoint-2000/special_tokens_map.json


[2m[36m(train_distilbert pid=10665)[0m Saving model checkpoint to ./checkpoint-2500
[2m[36m(train_distilbert pid=10665)[0m Configuration saved in ./checkpoint-2500/config.json
[2m[36m(train_distilbert pid=10665)[0m Model weights saved in ./checkpoint-2500/pytorch_model.bin
[2m[36m(train_distilbert pid=10665)[0m tokenizer config file saved in ./checkpoint-2500/tokenizer_config.json
[2m[36m(train_distilbert pid=10665)[0m Special tokens file saved in ./checkpoint-2500/special_tokens_map.json


[2m[36m(train_distilbert pid=10665)[0m Saving model checkpoint to ./checkpoint-3000
[2m[36m(train_distilbert pid=10665)[0m Configuration saved in ./checkpoint-3000/config.json
[2m[36m(train_distilbert pid=10665)[0m Model weights saved in ./checkpoint-3000/pytorch_model.bin
[2m[36m(train_distilbert pid=10665)[0m tokenizer config file saved in ./checkpoint-3000/tokenizer_config.json
[2m[36m(train_distilbert pid=10665)[0m Special tokens file saved in ./checkpoint-3000/special_tokens_map.json


2022-12-01 23:14:06,936	INFO stopper.py:364 -- Reached timeout of 3600 seconds. Stopping all trials.


Trial name,status,loc,adam_beta1,adam_beta2,adam_epsilon,learning_rate,num_train_epochs,iter,total time (s),r2
train_distilbert_75886418,TERMINATED,172.28.0.2:7468,0.868934,0.98729,1.9703e-08,1.56626e-06,1.0,1.0,30.3793,0.0350193
train_distilbert_797cc10e,TERMINATED,172.28.0.2:7558,0.94368,0.980268,3.41024e-09,3.2432e-05,1.44427,1.0,40.3436,0.420948
train_distilbert_8c1c4f3c,TERMINATED,172.28.0.2:7656,0.83925,0.989131,3.57728e-08,1.46966e-06,1.0,1.0,30.4707,0.00501224
train_distilbert_a81fc376,TERMINATED,172.28.0.2:7747,0.898618,0.985452,1.08521e-08,1.66921e-06,1.02723,1.0,31.2992,0.00896462
train_distilbert_be2eb96a,TERMINATED,172.28.0.2:7839,0.892053,0.987263,1.7536e-08,1.41402e-06,1.0,1.0,30.5941,0.019258
train_distilbert_d4d3bcce,TERMINATED,172.28.0.2:7932,0.845815,0.987317,2.21377e-08,1.73489e-06,2.15443,1.0,56.6846,0.104871
train_distilbert_eae45e74,TERMINATED,172.28.0.2:8035,0.856788,0.988564,3.92079e-08,2.81634e-06,1.0,1.0,30.5621,0.0509921
train_distilbert_110ae6f4,TERMINATED,172.28.0.2:8128,0.857961,0.986044,1.11248e-08,1e-06,2.72935,1.0,71.3863,0.047188
train_distilbert_27172e26,TERMINATED,172.28.0.2:8238,0.806751,0.987522,2.67955e-08,1.95709e-06,1.59538,1.0,44.2964,0.143462
train_distilbert_55961c9e,TERMINATED,172.28.0.2:8336,0.884879,0.987112,1.82896e-08,1.53792e-06,2.90939,1.0,75.5793,0.181381


[2m[36m(train_distilbert pid=10665)[0m Saving model checkpoint to ./checkpoint-3500
[2m[36m(train_distilbert pid=10665)[0m Configuration saved in ./checkpoint-3500/config.json
[2m[36m(train_distilbert pid=10665)[0m Model weights saved in ./checkpoint-3500/pytorch_model.bin
[2m[36m(train_distilbert pid=10665)[0m tokenizer config file saved in ./checkpoint-3500/tokenizer_config.json
[2m[36m(train_distilbert pid=10665)[0m Special tokens file saved in ./checkpoint-3500/special_tokens_map.json
[2m[36m(train_distilbert pid=10665)[0m Saving model checkpoint to ./checkpoint-4000
[2m[36m(train_distilbert pid=10665)[0m Configuration saved in ./checkpoint-4000/config.json
[2m[36m(train_distilbert pid=10665)[0m Model weights saved in ./checkpoint-4000/pytorch_model.bin
[2m[36m(train_distilbert pid=10665)[0m tokenizer config file saved in ./checkpoint-4000/tokenizer_config.json
[2m[36m(train_distilbert pid=10665)[0m Special tokens file saved in ./checkpoint-4000/specia

[2m[36m(train_distilbert pid=10665)[0m {'train_runtime': 474.7559, 'train_samples_per_second': 72.816, 'train_steps_per_second': 9.118, 'train_loss': 0.010420797713278182, 'epoch': 20.32}


2022-12-01 23:16:03,226	INFO tune.py:748 -- Total run time: 3722.07 seconds (3605.67 seconds for the tuning loop).


[2m[36m(train_distilbert pid=10665)[0m {'eval_loss': 0.020890725776553154, 'eval_mse': 0.020890725776553154, 'eval_rmse': 0.14453624188899994, 'eval_mae': 0.11490651220083237, 'eval_r2': 0.4178848508485151, 'eval_smape': 24.54094545284431, 'eval_runtime': 4.2376, 'eval_samples_per_second': 236.453, 'eval_steps_per_second': 29.734, 'epoch': 20.32}


In [None]:
#Get the results of hyperparameters optimization 
best_trial = analysis.get_best_trial(HP_METRIC, MODE, "all")
metric = best_trial.metric_analysis[HP_METRIC][MODE]
print(f"n_trials={len(analysis.trials)}")
print(f"time={time.time()-start_time}")
print(f"Best model eval {HP_METRIC}: {metric:.4f}")
print(f"Best model parameters: {best_trial.config}")

n_trials=26
time=3748.902177810669
Best model eval r2: 0.4615
Best model parameters: {'num_train_epochs': 14.90289944900019, 'learning_rate': 4.278615732972273e-06, 'adam_epsilon': 6.900956598306757e-08, 'adam_beta1': 0.841736350312499, 'adam_beta2': 0.9938714514369775}


In [None]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/pytorch_model.bin
Some we

In [None]:
model.resize_token_embeddings(len(tokenizer))

Embedding(30522, 768, padding_idx=0)

In [None]:
#Now use the best hyperparameters found to train the model
training_args = TrainingArguments(output_dir="test_trainer",
                                  logging_steps=20000,
                                  per_device_train_batch_size=64,
                                  per_device_eval_batch_size=20,
                                  learning_rate = 4.278615732972273e-06,
                                  adam_epsilon = 6.900956598306757e-08,
                                  adam_beta1 = 0.841736350312499,
                                  adam_beta2 = 0.9938714514369775,
                                  num_train_epochs=15,
                                  )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics_for_regression
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
#Train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1701
  Num Epochs = 15
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 405
  Number of trainable parameters = 66954241


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=405, training_loss=0.010689811942018108, metrics={'train_runtime': 287.6483, 'train_samples_per_second': 88.702, 'train_steps_per_second': 1.408, 'total_flos': 3379845400888320.0, 'train_loss': 0.010689811942018108, 'epoch': 15.0})

In [None]:
#Evaluate the model
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1002
  Batch size = 20


{'eval_loss': 0.02048257365822792,
 'eval_mse': 0.02048257552087307,
 'eval_rmse': 0.14311735332012177,
 'eval_mae': 0.11232398450374603,
 'eval_r2': 0.4292578633151708,
 'eval_smape': 23.861486012350298,
 'eval_runtime': 4.0928,
 'eval_samples_per_second': 244.819,
 'eval_steps_per_second': 12.461,
 'epoch': 15.0}

# Fifth model

In [None]:
#Trying with different training arguments 
training_args = TrainingArguments(output_dir="test_trainer",
                                  logging_steps=20000,
                                  #per_device_train_batch_size=64,
                                  #per_device_eval_batch_size=20,
                                  learning_rate = 4.278615732972273e-06,
                                  adam_epsilon = 6.900956598306757e-08,
                                  adam_beta1 = 0.841736350312499,
                                  adam_beta2 = 0.9938714514369775,
                                  num_train_epochs=50,
                                  )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics_for_regression
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
#Train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1701
  Num Epochs = 50
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10650
  Number of trainable parameters = 66954241


Step,Training Loss


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2000
Configuration saved in test_trainer/checkpoint-2000/config.json
Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2500
Configuration saved in test_trainer/checkpoint-2500/config.json
Model weights saved in test_trainer/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-30

TrainOutput(global_step=10650, training_loss=0.0033621011653416594, metrics={'train_runtime': 1220.3216, 'train_samples_per_second': 69.695, 'train_steps_per_second': 8.727, 'total_flos': 1.12661513362944e+16, 'train_loss': 0.0033621011653416594, 'epoch': 50.0})

In [None]:
#Evaluate the model
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1002
  Batch size = 8


{'eval_loss': 0.020279884338378906,
 'eval_mse': 0.020279884338378906,
 'eval_rmse': 0.14240746200084686,
 'eval_mae': 0.11204035580158234,
 'eval_r2': 0.4349057738233796,
 'eval_smape': 23.816269804141715,
 'eval_runtime': 4.446,
 'eval_samples_per_second': 225.371,
 'eval_steps_per_second': 28.34,
 'epoch': 50.0}

# New ray search and Sixth model

In [73]:
from transformers import TrainingArguments, Trainer

In [74]:
MODEL_CHECKPOINT = "distilbert-base-uncased"

In [34]:
#Download the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=1)

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.w

In [35]:
#Re-size to the length of the tokenizer 
model.resize_token_embeddings(len(tokenizer))

Embedding(30522, 768, padding_idx=0)

In [64]:
#Traing args
args = TrainingArguments(
    output_dir='output',
    do_eval=True,
)

In [65]:
#Define the trainer 
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_for_regression,
)

In [None]:
#Train the model
trainer.train()

In [66]:
#Evaluate the model
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1002
  Batch size = 8
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.4004843831062317,
 'eval_mse': 0.4004843831062317,
 'eval_rmse': 0.6328383684158325,
 'eval_mae': 0.6030510663986206,
 'eval_r2': -10.159403175849627,
 'eval_smape': 199.99999999999997,
 'eval_runtime': 7.6485,
 'eval_samples_per_second': 131.005,
 'eval_steps_per_second': 16.474}

In [36]:
!pip install flaml

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [37]:
import flaml

In [69]:
def train_distilbert(config: dict): #Preparing the function for another tuning 

    

    train_dataset, eval_dataset = tokenized_train_dataset, tokenized_val_dataset

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT, num_labels=1
    )
    model.resize_token_embeddings(len(tokenizer))

    

    def compute_metrics_for_regression(eval_pred):
        logits, labels = eval_pred
        labels = labels.reshape(-1, 1)

        mse = mean_squared_error(labels, logits)
        rmse = mean_squared_error(labels, logits, squared=False)
        mae = mean_absolute_error(labels, logits)
        r2 = r2_score(labels, logits)
        smape = 1/len(labels) * np.sum(2 * np.abs(logits-labels) / (np.abs(labels) + np.abs(logits))*100)

        return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}


    training_args = TrainingArguments(
        output_dir='.',
        do_eval=False,
        disable_tqdm=True,
        logging_steps=20000,
        save_total_limit=0,
        **config,
    )

    trainer = Trainer(
        model,
        training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_for_regression,
    )

    # train model
    trainer.train()

    # evaluate model
    eval_output = trainer.evaluate()

    # report the metric to optimize
    flaml.tune.report(
        r2=eval_output["eval_r2"],
    )

In [70]:
############ Version 2 ########################
#Different search space for hyperparameters tuning

max_num_epoch = 64
search_space = {
        # You can mix constants with search space objects.
        "num_train_epochs": flaml.tune.loguniform(1, max_num_epoch),
        "learning_rate": flaml.tune.loguniform(1e-6, 1e-4),
        "adam_epsilon": flaml.tune.loguniform(1e-9, 1e-7),
        "adam_beta1": flaml.tune.uniform(0.8, 0.99),
        "adam_beta2": flaml.tune.loguniform(98e-2, 9999e-4),
        "weight_decay": flaml.tune.uniform(0.0, 0.3),
        
}

In [71]:
# optimization objective
HP_METRIC, MODE = "r2", "max"

# resources
num_cpus = 0
num_gpus = 1

# constraints
num_samples = -1    # number of trials, -1 means unlimited
time_budget_s = 3600    # time budget in seconds

In [38]:
!pip install pickle5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pickle5
  Downloading pickle5-0.0.11.tar.gz (132 kB)
[K     |████████████████████████████████| 132 kB 15.8 MB/s 
[?25hBuilding wheels for collected packages: pickle5
  Building wheel for pickle5 (setup.py) ... [?25l[?25hdone
  Created wheel for pickle5: filename=pickle5-0.0.11-cp38-cp38-linux_x86_64.whl size=236289 sha256=2f125c921bff129c58703213e7e09403daec129c23a997f1f3c2bf7eb4eb8261
  Stored in directory: /root/.cache/pip/wheels/25/d4/61/dbd8edd1a0d656be7b4267c85db3b61951eb60016a0154a122
Successfully built pickle5
Installing collected packages: pickle5
Successfully installed pickle5-0.0.11


In [39]:
import pickle5 as pickle

In [74]:
#Another ray hyperparameters tuning
import time
import ray
start_time = time.time()
ray.shutdown()
ray.init(num_cpus=num_cpus, num_gpus=num_gpus)

print("Tuning started...")
analysis = flaml.tune.run(
    train_distilbert,
    search_alg=flaml.CFO(
        space=search_space,
        metric=HP_METRIC,
        mode=MODE,
        low_cost_partial_config={"num_train_epochs": 1}),
    # uncomment the following if scheduler = 'asha',
    # max_resource=max_num_epoch, min_resource=1,
    resources_per_trial={"gpu": num_gpus, "cpu": num_cpus},
    local_dir='logs/',
    num_samples=num_samples,
    time_budget_s=time_budget_s,
    use_ray=True,
)

ray.shutdown()



Tuning started...


[2m[36m(train_distilbert pid=1110)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
[2m[36m(train_distilbert pid=1110)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=1110)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=1110)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=1110)[0m {'train_runtime': 25.0235, 'train_samples_per_second': 67.976, 'train_steps_per_second': 8.512, 'train_loss': 0.04155403907310235, 'epoch': 1.0}


[2m[36m(train_distilbert pid=1110)[0m 
[2m[36m(train_distilbert pid=1110)[0m 
[2m[36m(train_distilbert pid=1110)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=1110)[0m 
[2m[36m(train_distilbert pid=1110)[0m 
[2m[36m(train_distilbert pid=1110)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=1110)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=1110)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=1110)[0m   Batch size = 8


Trial train_distilbert_0c5ec8d0 reported r2=0.04 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1.5662610420278344e-06, 'adam_epsilon': 1.9702991167906198e-08, 'adam_beta1': 0.8689337534108345, 'adam_beta2': 0.9872898093714128, 'weight_decay': 0.22664606941097878}.
Trial train_distilbert_0c5ec8d0 completed. Last result: r2=0.037139874103649606
[2m[36m(train_distilbert pid=1110)[0m {'eval_loss': 0.034554753452539444, 'eval_mse': 0.034554753452539444, 'eval_rmse': 0.1858890950679779, 'eval_mae': 0.1528949737548828, 'eval_r2': 0.037139874103649606, 'eval_smape': 31.248000093562872, 'eval_runtime': 4.2961, 'eval_samples_per_second': 233.234, 'eval_steps_per_second': 29.329, 'epoch': 1.0}


[2m[36m(train_distilbert pid=1205)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=1205)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=1205)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=1205)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=1205)[0m {'train_runtime': 34.0475, 'train_samples_per_second': 72.155, 'train_steps_per_second': 9.046, 'train_loss': 0.03124229319683917, 'epoch': 1.45}


[2m[36m(train_distilbert pid=1205)[0m 
[2m[36m(train_distilbert pid=1205)[0m 
[2m[36m(train_distilbert pid=1205)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=1205)[0m 
[2m[36m(train_distilbert pid=1205)[0m 
[2m[36m(train_distilbert pid=1205)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=1205)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=1205)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=1205)[0m   Batch size = 8


Trial train_distilbert_10a85884 reported r2=0.19 with parameters={'num_train_epochs': 1.444265389543504, 'learning_rate': 3.4102391893542776e-06, 'adam_epsilon': 3.254028596506071e-08, 'adam_beta1': 0.8025861976630991, 'adam_beta2': 0.9952338248324752, 'weight_decay': 0.029945923065410704}.
Trial train_distilbert_10a85884 completed. Last result: r2=0.18726679500452392
[2m[36m(train_distilbert pid=1205)[0m {'eval_loss': 0.02916705794632435, 'eval_mse': 0.02916705794632435, 'eval_rmse': 0.1707836538553238, 'eval_mae': 0.13947932422161102, 'eval_r2': 0.18726679500452392, 'eval_smape': 28.776739489770456, 'eval_runtime': 4.2606, 'eval_samples_per_second': 235.176, 'eval_steps_per_second': 29.573, 'epoch': 1.45}


[2m[36m(train_distilbert pid=1304)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight']
[2m[36m(train_distilbert pid=1304)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=1304)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=1304)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=1304)[0m 
[2m[36m(train_distilbert pid=1304)[0m 
[2m[36m(train_distilbert pid=1304)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=1304)[0m 
[2m[36m(train_distilbert pid=1304)[0m 
[2m[36m(train_distilbert pid=1304)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=1304)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=1304)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=1304)[0m   Batch size = 8


[2m[36m(train_distilbert pid=1304)[0m {'train_runtime': 23.8673, 'train_samples_per_second': 71.269, 'train_steps_per_second': 8.924, 'train_loss': 0.05091224813685171, 'epoch': 1.0}


Trial train_distilbert_24aefc66 reported r2=0.01 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1.489090831567398e-06, 'adam_epsilon': 3.163121807821295e-08, 'adam_beta1': 0.84537379213597, 'adam_beta2': 0.9887505670214453, 'weight_decay': 0.175998257587662}.
Trial train_distilbert_24aefc66 completed. Last result: r2=0.010822632746843475
[2m[36m(train_distilbert pid=1304)[0m {'eval_loss': 0.035499218851327896, 'eval_mse': 0.0354992151260376, 'eval_rmse': 0.1884123533964157, 'eval_mae': 0.1554177701473236, 'eval_r2': 0.010822632746843475, 'eval_smape': 31.680022766966065, 'eval_runtime': 4.2597, 'eval_samples_per_second': 235.229, 'eval_steps_per_second': 29.58, 'epoch': 1.0}


[2m[36m(train_distilbert pid=1399)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=1399)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=1399)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=1399)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=1399)[0m 
[2m[36m(train_distilbert pid=1399)[0m 
[2m[36m(train_distilbert pid=1399)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=1399)[0m 
[2m[36m(train_distilbert pid=1399)[0m 
[2m[36m(train_distilbert pid=1399)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=1399)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=1399)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=1399)[0m   Batch size = 8


[2m[36m(train_distilbert pid=1399)[0m {'train_runtime': 24.4809, 'train_samples_per_second': 70.98, 'train_steps_per_second': 8.905, 'train_loss': 0.03517600593216922, 'epoch': 1.02}


Trial train_distilbert_41e4cf86 reported r2=0.02 with parameters={'num_train_epochs': 1.021552403070911, 'learning_rate': 1.647430499046145e-06, 'adam_epsilon': 1.2272934289241964e-08, 'adam_beta1': 0.8924937146856989, 'adam_beta2': 0.9858312098115838, 'weight_decay': 0.2772938812342956}.
Trial train_distilbert_41e4cf86 completed. Last result: r2=0.02477961929278316
[2m[36m(train_distilbert pid=1399)[0m {'eval_loss': 0.0349983349442482, 'eval_mse': 0.0349983312189579, 'eval_rmse': 0.18707841634750366, 'eval_mae': 0.15451224148273468, 'eval_r2': 0.02477961929278316, 'eval_smape': 31.53005707335329, 'eval_runtime': 4.2569, 'eval_samples_per_second': 235.385, 'eval_steps_per_second': 29.599, 'epoch': 1.02}


[2m[36m(train_distilbert pid=1493)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=1493)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=1493)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=1493)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=1493)[0m {'train_runtime': 23.9074, 'train_samples_per_second': 71.149, 'train_steps_per_second': 8.909, 'train_loss': 0.04589774798899189, 'epoch': 1.0}


[2m[36m(train_distilbert pid=1493)[0m 
[2m[36m(train_distilbert pid=1493)[0m 
[2m[36m(train_distilbert pid=1493)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=1493)[0m 
[2m[36m(train_distilbert pid=1493)[0m 
[2m[36m(train_distilbert pid=1493)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=1493)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=1493)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=1493)[0m   Batch size = 8


Trial train_distilbert_59242c46 reported r2=0.06 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1.36325263473191e-06, 'adam_epsilon': 3.8413113126853025e-08, 'adam_beta1': 0.8686254755729765, 'adam_beta2': 0.9857270860586153, 'weight_decay': 0.2796043772067232}.
Trial train_distilbert_59242c46 completed. Last result: r2=0.056840671586346314
[2m[36m(train_distilbert pid=1493)[0m {'eval_loss': 0.03384774178266525, 'eval_mse': 0.03384774178266525, 'eval_rmse': 0.18397755920886993, 'eval_mae': 0.15232370793819427, 'eval_r2': 0.056840671586346314, 'eval_smape': 31.128005707335326, 'eval_runtime': 4.2699, 'eval_samples_per_second': 234.664, 'eval_steps_per_second': 29.509, 'epoch': 1.0}


[2m[36m(train_distilbert pid=1587)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias']
[2m[36m(train_distilbert pid=1587)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=1587)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=1587)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=1587)[0m {'train_runtime': 26.7012, 'train_samples_per_second': 71.114, 'train_steps_per_second': 8.913, 'train_loss': 0.03870559339763738, 'epoch': 1.12}


[2m[36m(train_distilbert pid=1587)[0m 
[2m[36m(train_distilbert pid=1587)[0m 
[2m[36m(train_distilbert pid=1587)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=1587)[0m 
[2m[36m(train_distilbert pid=1587)[0m 
[2m[36m(train_distilbert pid=1587)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=1587)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=1587)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=1587)[0m   Batch size = 8


Trial train_distilbert_706258e2 reported r2=0.04 with parameters={'num_train_epochs': 1.1163034689325957, 'learning_rate': 1.7995003928648526e-06, 'adam_epsilon': 1.0106128594175562e-08, 'adam_beta1': 0.8692420312486924, 'adam_beta2': 0.9888550101490047, 'weight_decay': 0.1736877616152344}.
Trial train_distilbert_706258e2 completed. Last result: r2=0.042384363620348475
[2m[36m(train_distilbert pid=1587)[0m {'eval_loss': 0.03436654806137085, 'eval_mse': 0.03436654061079025, 'eval_rmse': 0.18538214266300201, 'eval_mae': 0.15381930768489838, 'eval_r2': 0.042384363620348475, 'eval_smape': 31.37971712824351, 'eval_runtime': 4.2763, 'eval_samples_per_second': 234.315, 'eval_steps_per_second': 29.465, 'epoch': 1.12}


[2m[36m(train_distilbert pid=1683)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
[2m[36m(train_distilbert pid=1683)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=1683)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=1683)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=1683)[0m {'train_runtime': 54.3289, 'train_samples_per_second': 72.323, 'train_steps_per_second': 9.074, 'train_loss': 0.03283338566097236, 'epoch': 2.31}


[2m[36m(train_distilbert pid=1683)[0m 
[2m[36m(train_distilbert pid=1683)[0m 
[2m[36m(train_distilbert pid=1683)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=1683)[0m 
[2m[36m(train_distilbert pid=1683)[0m 
[2m[36m(train_distilbert pid=1683)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=1683)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=1683)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=1683)[0m   Batch size = 8


Trial train_distilbert_87a3b280 reported r2=0.07 with parameters={'num_train_epochs': 2.3099532396348588, 'learning_rate': 1e-06, 'adam_epsilon': 5.7201310857638346e-08, 'adam_beta1': 0.8579860801688433, 'adam_beta2': 0.986129159247091, 'weight_decay': 0.28924743330906344}.
Trial train_distilbert_87a3b280 completed. Last result: r2=0.06624256172581422
[2m[36m(train_distilbert pid=1683)[0m {'eval_loss': 0.03351033106446266, 'eval_mse': 0.03351032733917236, 'eval_rmse': 0.1830582618713379, 'eval_mae': 0.15066848695278168, 'eval_r2': 0.06624256172581422, 'eval_smape': 30.8249711514471, 'eval_runtime': 4.2719, 'eval_samples_per_second': 234.557, 'eval_steps_per_second': 29.495, 'epoch': 2.31}


[2m[36m(train_distilbert pid=1791)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
[2m[36m(train_distilbert pid=1791)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=1791)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=1791)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=1791)[0m {'train_runtime': 23.9841, 'train_samples_per_second': 70.922, 'train_steps_per_second': 8.881, 'train_loss': 0.04286044304359687, 'epoch': 1.0}


[2m[36m(train_distilbert pid=1791)[0m 
[2m[36m(train_distilbert pid=1791)[0m 
[2m[36m(train_distilbert pid=1791)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=1791)[0m 
[2m[36m(train_distilbert pid=1791)[0m 
[2m[36m(train_distilbert pid=1791)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=1791)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=1791)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=1791)[0m   Batch size = 8


Trial train_distilbert_a4310376 reported r2=0.05 with parameters={'num_train_epochs': 1.0, 'learning_rate': 2.026885305542177e-06, 'adam_epsilon': 2.5796039251071995e-08, 'adam_beta1': 0.8792648709771098, 'adam_beta2': 0.9853251768069294, 'weight_decay': 0.26996132110438303}.
Trial train_distilbert_a4310376 completed. Last result: r2=0.04514798213503157
[2m[36m(train_distilbert pid=1791)[0m {'eval_loss': 0.0342673622071743, 'eval_mse': 0.0342673622071743, 'eval_rmse': 0.18511445820331573, 'eval_mae': 0.15292708575725555, 'eval_r2': 0.04514798213503157, 'eval_smape': 31.286618169910177, 'eval_runtime': 4.2736, 'eval_samples_per_second': 234.463, 'eval_steps_per_second': 29.483, 'epoch': 1.0}


[2m[36m(train_distilbert pid=1887)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
[2m[36m(train_distilbert pid=1887)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=1887)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=1887)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=1887)[0m 
[2m[36m(train_distilbert pid=1887)[0m 
[2m[36m(train_distilbert pid=1887)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=1887)[0m 
[2m[36m(train_distilbert pid=1887)[0m 
[2m[36m(train_distilbert pid=1887)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=1887)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=1887)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=1887)[0m   Batch size = 8


[2m[36m(train_distilbert pid=1887)[0m {'train_runtime': 34.3783, 'train_samples_per_second': 71.953, 'train_steps_per_second': 9.017, 'train_loss': 0.052340765922300275, 'epoch': 1.46}


Trial train_distilbert_c9cc61a2 reported r2=0.02 with parameters={'num_train_epochs': 1.4542238969895613, 'learning_rate': 1.0260471543820472e-06, 'adam_epsilon': 3.733671615049165e-08, 'adam_beta1': 0.8415869602943116, 'adam_beta2': 0.9853378872394445, 'weight_decay': 0.23731823916552766}.
Trial train_distilbert_c9cc61a2 completed. Last result: r2=0.016143638496434187
[2m[36m(train_distilbert pid=1887)[0m {'eval_loss': 0.03530826047062874, 'eval_mse': 0.03530826047062874, 'eval_rmse': 0.18790492415428162, 'eval_mae': 0.15494856238365173, 'eval_r2': 0.016143638496434187, 'eval_smape': 31.673527944111775, 'eval_runtime': 4.2641, 'eval_samples_per_second': 234.987, 'eval_steps_per_second': 29.549, 'epoch': 1.46}


[2m[36m(train_distilbert pid=1984)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight']
[2m[36m(train_distilbert pid=1984)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=1984)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=1984)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=1984)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=1984)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=1984)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=1984)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=1984)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=1984)[0m {'train_runtime': 87.324, 'train_samples_per_second': 71.474, 'train_steps_per_second': 8.955, 'train_loss': 0.04220191292140795, 'epoch': 3.67}


[2m[36m(train_distilbert pid=1984)[0m 
[2m[36m(train_distilbert pid=1984)[0m 
[2m[36m(train_distilbert pid=1984)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=1984)[0m 
[2m[36m(train_distilbert pid=1984)[0m 
[2m[36m(train_distilbert pid=1984)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=1984)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=1984)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=1984)[0m   Batch size = 8


Trial train_distilbert_e10cee68 reported r2=0.07 with parameters={'num_train_epochs': 3.6692313888841848, 'learning_rate': 1e-06, 'adam_epsilon': 8.763464763863791e-08, 'adam_beta1': 0.8743852000433749, 'adam_beta2': 0.986921066682846, 'weight_decay': 0.3}.
Trial train_distilbert_e10cee68 completed. Last result: r2=0.07060210210068585
[2m[36m(train_distilbert pid=1984)[0m {'eval_loss': 0.03335386887192726, 'eval_mse': 0.03335387259721756, 'eval_rmse': 0.18263041973114014, 'eval_mae': 0.15088699758052826, 'eval_r2': 0.07060210210068585, 'eval_smape': 30.825700162175647, 'eval_runtime': 4.2691, 'eval_samples_per_second': 234.712, 'eval_steps_per_second': 29.515, 'epoch': 3.67}


[2m[36m(train_distilbert pid=2101)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
[2m[36m(train_distilbert pid=2101)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=2101)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=2101)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=2101)[0m {'train_runtime': 26.1093, 'train_samples_per_second': 71.444, 'train_steps_per_second': 8.962, 'train_loss': 0.06281854759933603, 'epoch': 1.1}


[2m[36m(train_distilbert pid=2101)[0m 
[2m[36m(train_distilbert pid=2101)[0m 
[2m[36m(train_distilbert pid=2101)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=2101)[0m 
[2m[36m(train_distilbert pid=2101)[0m 
[2m[36m(train_distilbert pid=2101)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=2101)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=2101)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=2101)[0m   Batch size = 8


Trial train_distilbert_fe415550 reported r2=0.00 with parameters={'num_train_epochs': 1.0966263686353561, 'learning_rate': 1.1382765822317426e-06, 'adam_epsilon': 8.198105233308247e-08, 'adam_beta1': 0.8776292834437661, 'adam_beta2': 0.9850137491039844, 'weight_decay': 0.3}.
Trial train_distilbert_fe415550 completed. Last result: r2=0.001162664344729336
[2m[36m(train_distilbert pid=2101)[0m {'eval_loss': 0.035845886915922165, 'eval_mse': 0.035845886915922165, 'eval_rmse': 0.1893301010131836, 'eval_mae': 0.15668334066867828, 'eval_r2': 0.001162664344729336, 'eval_smape': 31.924927488772454, 'eval_runtime': 4.2801, 'eval_samples_per_second': 234.104, 'eval_steps_per_second': 29.438, 'epoch': 1.1}


[2m[36m(train_distilbert pid=2192)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=2192)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=2192)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=2192)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=2192)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=2192)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=2192)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=2192)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=2192)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=2192)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=2192)[0m Configuration saved in ./checkpoint-1000/config.json


[2m[36m(train_distilbert pid=2192)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=2192)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=2192)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=2192)[0m Saving model checkpoint to ./checkpoint-1500
[2m[36m(train_distilbert pid=2192)[0m Configuration saved in ./checkpoint-1500/config.json
[2m[36m(train_distilbert pid=2192)[0m Model weights saved in ./checkpoint-1500/pytorch_model.bin
[2m[36m(train_distilbert pid=2192)[0m tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
[2m[36m(train_distilbert pid=2192)[0m Special tokens file saved in ./checkpoint-1500/special_tokens_map.json


[2m[36m(train_distilbert pid=2192)[0m {'train_runtime': 184.4325, 'train_samples_per_second': 71.283, 'train_steps_per_second': 8.93, 'train_loss': 0.025239030737260217, 'epoch': 7.73}


[2m[36m(train_distilbert pid=2192)[0m 
[2m[36m(train_distilbert pid=2192)[0m 
[2m[36m(train_distilbert pid=2192)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=2192)[0m 
[2m[36m(train_distilbert pid=2192)[0m 
[2m[36m(train_distilbert pid=2192)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=2192)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=2192)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=2192)[0m   Batch size = 8


Trial train_distilbert_3e8fb034 reported r2=0.33 with parameters={'num_train_epochs': 7.72893409837498, 'learning_rate': 1e-06, 'adam_epsilon': 6.114604019854072e-08, 'adam_beta1': 0.8547419967684521, 'adam_beta2': 0.9880386366347627, 'weight_decay': 0.27462414799443063}.
Trial train_distilbert_3e8fb034 completed. Last result: r2=0.334183806127616
[2m[36m(train_distilbert pid=2192)[0m {'eval_loss': 0.023894555866718292, 'eval_mse': 0.023894555866718292, 'eval_rmse': 0.15457864105701447, 'eval_mae': 0.12322226166725159, 'eval_r2': 0.334183806127616, 'eval_smape': 25.63610669286427, 'eval_runtime': 4.2919, 'eval_samples_per_second': 233.461, 'eval_steps_per_second': 29.357, 'epoch': 7.73}


[2m[36m(train_distilbert pid=2334)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias']
[2m[36m(train_distilbert pid=2334)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=2334)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=2334)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=2334)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=2334)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=2334)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=2334)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=2334)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=2334)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=2334)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=2334)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=2334)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=2334)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=2334)[0m 
[2m[36m(train_distilbert pid=2334)[0m 
[2m[36m(train_distilbert pid=2334)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=2334)[0m 
[2m[36m(train_distilbert pid=2334)[0m 
[2m[36m(train_distilbert pid=2334)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=2334)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=2334)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=2334)[0m   Batch size = 8


[2m[36m(train_distilbert pid=2334)[0m {'train_runtime': 142.6887, 'train_samples_per_second': 71.563, 'train_steps_per_second': 8.964, 'train_loss': 0.02672868300639996, 'epoch': 6.0}


Trial train_distilbert_57018f5c reported r2=0.35 with parameters={'num_train_epochs': 6.003069943107681, 'learning_rate': 1.50645083500445e-06, 'adam_epsilon': 5.416348580474996e-08, 'adam_beta1': 0.8493206861486333, 'adam_beta2': 0.9875244686004083, 'weight_decay': 0.3}.
Trial train_distilbert_57018f5c completed. Last result: r2=0.3546552463943814
[2m[36m(train_distilbert pid=2334)[0m {'eval_loss': 0.02315988391637802, 'eval_mse': 0.02315988391637802, 'eval_rmse': 0.15218371152877808, 'eval_mae': 0.12004639208316803, 'eval_r2': 0.3546552463943814, 'eval_smape': 25.07169255239521, 'eval_runtime': 4.26, 'eval_samples_per_second': 235.212, 'eval_steps_per_second': 29.578, 'epoch': 6.0}


[2m[36m(train_distilbert pid=2454)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=2454)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=2454)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=2454)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=2454)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=2454)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=2454)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=2454)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=2454)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=2454)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=2454)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=2454)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=2454)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=2454)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=2454)[0m {'train_runtime': 112.9694, 'train_samples_per_second': 71.132, 'train_steps_per_second': 8.914, 'train_loss': 0.037146321119594476, 'epoch': 4.73}


[2m[36m(train_distilbert pid=2454)[0m 
[2m[36m(train_distilbert pid=2454)[0m 
[2m[36m(train_distilbert pid=2454)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=2454)[0m 
[2m[36m(train_distilbert pid=2454)[0m 
[2m[36m(train_distilbert pid=2454)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=2454)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=2454)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=2454)[0m   Batch size = 8


Trial train_distilbert_ca1ea97a reported r2=0.19 with parameters={'num_train_epochs': 4.7241241340082905, 'learning_rate': 1e-06, 'adam_epsilon': 9.89321792658179e-08, 'adam_beta1': 0.8798065106631938, 'adam_beta2': 0.9874349205477921, 'weight_decay': 0.24643772525445523}.
Trial train_distilbert_ca1ea97a completed. Last result: r2=0.18547733511604936
[2m[36m(train_distilbert pid=2454)[0m {'eval_loss': 0.029231274500489235, 'eval_mse': 0.029231274500489235, 'eval_rmse': 0.1709715574979782, 'eval_mae': 0.14045915007591248, 'eval_r2': 0.18547733511604936, 'eval_smape': 28.978301210079838, 'eval_runtime': 4.272, 'eval_samples_per_second': 234.55, 'eval_steps_per_second': 29.494, 'epoch': 4.73}


[2m[36m(train_distilbert pid=2567)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=2567)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=2567)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=2567)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=2567)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=2567)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=2567)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=2567)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=2567)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=2567)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=2567)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=2567)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=2567)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=2567)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=2567)[0m {'train_runtime': 139.1618, 'train_samples_per_second': 71.737, 'train_steps_per_second': 8.99, 'train_loss': 0.018238863499044515, 'epoch': 5.87}


[2m[36m(train_distilbert pid=2567)[0m 
[2m[36m(train_distilbert pid=2567)[0m 
[2m[36m(train_distilbert pid=2567)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=2567)[0m 
[2m[36m(train_distilbert pid=2567)[0m 
[2m[36m(train_distilbert pid=2567)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=2567)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=2567)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=2567)[0m   Batch size = 8


Trial train_distilbert_27ba9580 reported r2=0.44 with parameters={'num_train_epochs': 5.868951009731138, 'learning_rate': 4.094382330037184e-06, 'adam_epsilon': 7.004462571334688e-08, 'adam_beta1': 0.8590255024750992, 'adam_beta2': 0.9888678055351526, 'weight_decay': 0.3}.
Trial train_distilbert_27ba9580 completed. Last result: r2=0.4367415338766143
[2m[36m(train_distilbert pid=2567)[0m {'eval_loss': 0.020214004442095757, 'eval_mse': 0.020214004442095757, 'eval_rmse': 0.14217595756053925, 'eval_mae': 0.11255673319101334, 'eval_r2': 0.4367415338766143, 'eval_smape': 23.82804898016467, 'eval_runtime': 4.2701, 'eval_samples_per_second': 234.657, 'eval_steps_per_second': 29.508, 'epoch': 5.87}


[2m[36m(train_distilbert pid=2686)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=2686)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=2686)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=2686)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=2686)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=2686)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=2686)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=2686)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=2686)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=2686)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=2686)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=2686)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=2686)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=2686)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=2686)[0m 
[2m[36m(train_distilbert pid=2686)[0m 
[2m[36m(train_distilbert pid=2686)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=2686)[0m 
[2m[36m(train_distilbert pid=2686)[0m 
[2m[36m(train_distilbert pid=2686)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=2686)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=2686)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=2686)[0m   Batch size = 8


[2m[36m(train_distilbert pid=2686)[0m {'train_runtime': 145.6167, 'train_samples_per_second': 71.726, 'train_steps_per_second': 8.982, 'train_loss': 0.034216186686757875, 'epoch': 6.14}


Trial train_distilbert_7406cbb6 reported r2=0.18 with parameters={'num_train_epochs': 6.140253800396561, 'learning_rate': 1e-06, 'adam_epsilon': 4.1883059044776154e-08, 'adam_beta1': 0.8396158698221674, 'adam_beta2': 0.9861829565345798, 'weight_decay': 0.28490524614359275}.
Trial train_distilbert_7406cbb6 completed. Last result: r2=0.17762318238625008
[2m[36m(train_distilbert pid=2686)[0m {'eval_loss': 0.02951314114034176, 'eval_mse': 0.02951314114034176, 'eval_rmse': 0.1717938929796219, 'eval_mae': 0.1399141401052475, 'eval_r2': 0.17762318238625008, 'eval_smape': 28.869507079590814, 'eval_runtime': 4.269, 'eval_samples_per_second': 234.717, 'eval_steps_per_second': 29.515, 'epoch': 6.14}


[2m[36m(train_distilbert pid=2808)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=2808)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=2808)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=2808)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=2808)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=2808)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=2808)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=2808)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=2808)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=2808)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=2808)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=2808)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=2808)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=2808)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=2808)[0m 
[2m[36m(train_distilbert pid=2808)[0m 
[2m[36m(train_distilbert pid=2808)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=2808)[0m 
[2m[36m(train_distilbert pid=2808)[0m 
[2m[36m(train_distilbert pid=2808)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=2808)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=2808)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=2808)[0m   Batch size = 8


[2m[36m(train_distilbert pid=2808)[0m {'train_runtime': 117.2801, 'train_samples_per_second': 71.285, 'train_steps_per_second': 8.927, 'train_loss': 0.016347200162317374, 'epoch': 4.92}
Trial train_distilbert_d06d9ca4 reported r2=0.43 with parameters={'num_train_epochs': 4.9149368041047286, 'learning_rate': 9.641496199313686e-06, 'adam_epsilon': 1e-07, 'adam_beta1': 0.8483437360228483, 'adam_beta2': 0.9882966224298836, 'weight_decay': 0.26852224944844605}.
Trial train_distilbert_d06d9ca4 completed. Last result: r2=0.4253203265087382
[2m[36m(train_distilbert pid=2808)[0m {'eval_loss': 0.020623883232474327, 'eval_mse': 0.020623883232474327, 'eval_rmse': 0.1436101794242859, 'eval_mae': 0.11474476754665375, 'eval_r2': 0.4253203265087382, 'eval_smape': 24.343884496631734, 'eval_runtime': 4.2617, 'eval_samples_per_second': 235.117, 'eval_steps_per_second': 29.566, 'epoch': 4.92}


[2m[36m(train_distilbert pid=2930)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=2930)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=2930)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=2930)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=2930)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=2930)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=2930)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=2930)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=2930)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=2930)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=2930)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=2930)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=2930)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=2930)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=2930)[0m {'train_runtime': 166.0892, 'train_samples_per_second': 71.774, 'train_steps_per_second': 8.989, 'train_loss': 0.023865821771947474, 'epoch': 7.01}


[2m[36m(train_distilbert pid=2930)[0m 
[2m[36m(train_distilbert pid=2930)[0m 
[2m[36m(train_distilbert pid=2930)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=2930)[0m 
[2m[36m(train_distilbert pid=2930)[0m 
[2m[36m(train_distilbert pid=2930)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=2930)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=2930)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=2930)[0m   Batch size = 8


Trial train_distilbert_2fcc9b64 reported r2=0.40 with parameters={'num_train_epochs': 7.008144219851947, 'learning_rate': 1.7387308274533194e-06, 'adam_epsilon': 4.570245505406172e-08, 'adam_beta1': 0.8697072689273502, 'adam_beta2': 0.9894393187540054, 'weight_decay': 0.3}.
Trial train_distilbert_2fcc9b64 completed. Last result: r2=0.40161350205184176
[2m[36m(train_distilbert pid=2930)[0m {'eval_loss': 0.02147466503083706, 'eval_mse': 0.02147466503083706, 'eval_rmse': 0.14654237031936646, 'eval_mae': 0.11640966683626175, 'eval_r2': 0.40161350205184176, 'eval_smape': 24.46438763098802, 'eval_runtime': 4.2563, 'eval_samples_per_second': 235.414, 'eval_steps_per_second': 29.603, 'epoch': 7.01}


[2m[36m(train_distilbert pid=3057)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=3057)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3057)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3057)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3057)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=3057)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=3057)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=3057)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=3057)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=3057)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=3057)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=3057)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=3057)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=3057)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=3057)[0m Saving model checkpoint to ./checkpoint-1500
[2m[36m(train_distilbert pid=3057)[0m Configuration saved in ./checkpoint-1500/config.json


[2m[36m(train_distilbert pid=3057)[0m Model weights saved in ./checkpoint-1500/pytorch_model.bin
[2m[36m(train_distilbert pid=3057)[0m tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
[2m[36m(train_distilbert pid=3057)[0m Special tokens file saved in ./checkpoint-1500/special_tokens_map.json


[2m[36m(train_distilbert pid=3057)[0m 
[2m[36m(train_distilbert pid=3057)[0m 
[2m[36m(train_distilbert pid=3057)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3057)[0m 
[2m[36m(train_distilbert pid=3057)[0m 
[2m[36m(train_distilbert pid=3057)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3057)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3057)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=3057)[0m   Batch size = 8


[2m[36m(train_distilbert pid=3057)[0m {'train_runtime': 180.8975, 'train_samples_per_second': 71.473, 'train_steps_per_second': 8.955, 'train_loss': 0.022247095461244936, 'epoch': 7.61}


Trial train_distilbert_81f560b0 reported r2=0.40 with parameters={'num_train_epochs': 7.6010398992300585, 'learning_rate': 1.7484331802653309e-06, 'adam_epsilon': 5.742409719328301e-08, 'adam_beta1': 0.8774054954333854, 'adam_beta2': 0.9903104761331767, 'weight_decay': 0.3}.
Trial train_distilbert_81f560b0 completed. Last result: r2=0.3981462819754775
[2m[36m(train_distilbert pid=3057)[0m {'eval_loss': 0.021599095314741135, 'eval_mse': 0.021599093452095985, 'eval_rmse': 0.14696629345417023, 'eval_mae': 0.11671100556850433, 'eval_r2': 0.3981462819754775, 'eval_smape': 24.496296469560875, 'eval_runtime': 4.2497, 'eval_samples_per_second': 235.781, 'eval_steps_per_second': 29.649, 'epoch': 7.61}


[2m[36m(train_distilbert pid=3188)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight']
[2m[36m(train_distilbert pid=3188)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3188)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3188)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3188)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=3188)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=3188)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=3188)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=3188)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=3188)[0m 
[2m[36m(train_distilbert pid=3188)[0m 
[2m[36m(train_distilbert pid=3188)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3188)[0m 
[2m[36m(train_distilbert pid=3188)[0m 
[2m[36m(train_distilbert pid=3188)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3188)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3188)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=3188)[0m   Batch size = 8


[2m[36m(train_distilbert pid=3188)[0m {'train_runtime': 106.8943, 'train_samples_per_second': 72.11, 'train_steps_per_second': 9.037, 'train_loss': 0.016071420278608427, 'epoch': 4.54}
Trial train_distilbert_ea5fd572 reported r2=0.46 with parameters={'num_train_epochs': 4.53156231400827, 'learning_rate': 9.587993898615405e-06, 'adam_epsilon': 8.54388633191563e-08, 'adam_beta1': 0.8406455095168129, 'adam_beta2': 0.9874272365996927, 'weight_decay': 0.27772053841008615}.
Trial train_distilbert_ea5fd572 completed. Last result: r2=0.4619531054562276
[2m[36m(train_distilbert pid=3188)[0m {'eval_loss': 0.01930922083556652, 'eval_mse': 0.01930922083556652, 'eval_rmse': 0.13895761966705322, 'eval_mae': 0.11124712973833084, 'eval_r2': 0.4619531054562276, 'eval_smape': 23.69484468562874, 'eval_runtime': 4.2637, 'eval_samples_per_second': 235.01, 'eval_steps_per_second': 29.552, 'epoch': 4.54}


[2m[36m(train_distilbert pid=3296)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=3296)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3296)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3296)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3296)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=3296)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=3296)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=3296)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=3296)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=3296)[0m 
[2m[36m(train_distilbert pid=3296)[0m 
[2m[36m(train_distilbert pid=3296)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3296)[0m 
[2m[36m(train_distilbert pid=3296)[0m 
[2m[36m(train_distilbert pid=3296)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3296)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3296)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=3296)[0m   Batch size = 8


[2m[36m(train_distilbert pid=3296)[0m {'train_runtime': 87.1677, 'train_samples_per_second': 71.829, 'train_steps_per_second': 9.006, 'train_loss': 0.019424771351419436, 'epoch': 3.69}
Trial train_distilbert_5f35d43c reported r2=0.44 with parameters={'num_train_epochs': 3.6808813364134605, 'learning_rate': 5.720856000773118e-06, 'adam_epsilon': 6.604555387986216e-08, 'adam_beta1': 0.8829334196974978, 'adam_beta2': 0.9883459496647106, 'weight_decay': 0.3}.
Trial train_distilbert_5f35d43c completed. Last result: r2=0.44182905661941796
[2m[36m(train_distilbert pid=3296)[0m {'eval_loss': 0.02003142423927784, 'eval_mse': 0.02003142423927784, 'eval_rmse': 0.14153242111206055, 'eval_mae': 0.11180277913808823, 'eval_r2': 0.44182905661941796, 'eval_smape': 23.61506284306387, 'eval_runtime': 4.2633, 'eval_samples_per_second': 235.032, 'eval_steps_per_second': 29.555, 'epoch': 3.69}


[2m[36m(train_distilbert pid=3400)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
[2m[36m(train_distilbert pid=3400)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3400)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3400)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3400)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=3400)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=3400)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=3400)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=3400)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=3400)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=3400)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=3400)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=3400)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=3400)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=3400)[0m Saving model checkpoint to ./checkpoint-1500
[2m[36m(train_distilbert pid=3400)[0m Configuration saved in ./checkpoint-1500/config.json
[2m[36m(train_distilbert pid=3400)[0m Model weights saved in ./checkpoint-1500/pytorch_model.bin
[2m[36m(train_distilbert pid=3400)[0m tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
[2m[36m(train_distilbert pid=3400)[0m Special tokens file saved in ./checkpoint-1500/special_tokens_map.json


[2m[36m(train_distilbert pid=3400)[0m 
[2m[36m(train_distilbert pid=3400)[0m 
[2m[36m(train_distilbert pid=3400)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3400)[0m 
[2m[36m(train_distilbert pid=3400)[0m 
[2m[36m(train_distilbert pid=3400)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3400)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3400)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=3400)[0m   Batch size = 8


[2m[36m(train_distilbert pid=3400)[0m {'train_runtime': 171.771, 'train_samples_per_second': 71.55, 'train_steps_per_second': 8.96, 'train_loss': 0.014447640412189962, 'epoch': 7.23}
Trial train_distilbert_ab660246 reported r2=0.46 with parameters={'num_train_epochs': 7.225312306419607, 'learning_rate': 6.862069731118919e-06, 'adam_epsilon': 9.061220401679211e-08, 'adam_beta1': 0.8167375922944143, 'adam_beta2': 0.9879486073810761, 'weight_decay': 0.2298569248747562}.
Trial train_distilbert_ab660246 completed. Last result: r2=0.45810805317472825
[2m[36m(train_distilbert pid=3400)[0m {'eval_loss': 0.019447211176156998, 'eval_mse': 0.019447211176156998, 'eval_rmse': 0.13945326209068298, 'eval_mae': 0.11102928221225739, 'eval_r2': 0.45810805317472825, 'eval_smape': 23.622448462450098, 'eval_runtime': 4.2487, 'eval_samples_per_second': 235.837, 'eval_steps_per_second': 29.656, 'epoch': 7.23}


[2m[36m(train_distilbert pid=3532)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
[2m[36m(train_distilbert pid=3532)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3532)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3532)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3532)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=3532)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=3532)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=3532)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=3532)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=3532)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=3532)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=3532)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=3532)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=3532)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=3532)[0m 
[2m[36m(train_distilbert pid=3532)[0m 
[2m[36m(train_distilbert pid=3532)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3532)[0m 
[2m[36m(train_distilbert pid=3532)[0m 
[2m[36m(train_distilbert pid=3532)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3532)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3532)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=3532)[0m   Batch size = 8


[2m[36m(train_distilbert pid=3532)[0m {'train_runtime': 142.6829, 'train_samples_per_second': 71.842, 'train_steps_per_second': 8.999, 'train_loss': 0.011433945266628562, 'epoch': 6.03}


Trial train_distilbert_e830a398 reported r2=0.47 with parameters={'num_train_epochs': 6.026256911984053, 'learning_rate': 1.692812256685079e-05, 'adam_epsilon': 6.339335213316857e-08, 'adam_beta1': 0.827899706245196, 'adam_beta2': 0.9861635583059666, 'weight_decay': 0.3}.
Trial train_distilbert_e830a398 completed. Last result: r2=0.4660829275337999
[2m[36m(train_distilbert pid=3532)[0m {'eval_loss': 0.019161012023687363, 'eval_mse': 0.019161012023687363, 'eval_rmse': 0.13842330873012543, 'eval_mae': 0.11013011634349823, 'eval_r2': 0.4660829275337999, 'eval_smape': 23.450978901571855, 'eval_runtime': 4.2519, 'eval_samples_per_second': 235.657, 'eval_steps_per_second': 29.633, 'epoch': 6.03}


[2m[36m(train_distilbert pid=3651)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight']
[2m[36m(train_distilbert pid=3651)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3651)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3651)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3651)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=3651)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=3651)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=3651)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=3651)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=3651)[0m {'train_runtime': 80.8436, 'train_samples_per_second': 71.698, 'train_steps_per_second': 8.98, 'train_loss': 0.020887240890629034, 'epoch': 3.41}


[2m[36m(train_distilbert pid=3651)[0m 
[2m[36m(train_distilbert pid=3651)[0m 
[2m[36m(train_distilbert pid=3651)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3651)[0m 
[2m[36m(train_distilbert pid=3651)[0m 
[2m[36m(train_distilbert pid=3651)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3651)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3651)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=3651)[0m   Batch size = 8


Trial train_distilbert_539cf8e8 reported r2=0.42 with parameters={'num_train_epochs': 3.4075973370639363, 'learning_rate': 5.4305860934576294e-06, 'adam_epsilon': 1e-07, 'adam_beta1': 0.8533913127884298, 'adam_beta2': 0.988692534181433, 'weight_decay': 0.22814654027452233}.
Trial train_distilbert_539cf8e8 completed. Last result: r2=0.41703135707640115
[2m[36m(train_distilbert pid=3651)[0m {'eval_loss': 0.020921355113387108, 'eval_mse': 0.020921355113387108, 'eval_rmse': 0.1446421593427658, 'eval_mae': 0.11375291645526886, 'eval_r2': 0.41703135707640115, 'eval_smape': 23.881775511477045, 'eval_runtime': 4.2609, 'eval_samples_per_second': 235.159, 'eval_steps_per_second': 29.571, 'epoch': 3.41}


[2m[36m(train_distilbert pid=3754)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
[2m[36m(train_distilbert pid=3754)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3754)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3754)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3754)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=3754)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=3754)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=3754)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=3754)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=3754)[0m 
[2m[36m(train_distilbert pid=3754)[0m 
[2m[36m(train_distilbert pid=3754)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3754)[0m 
[2m[36m(train_distilbert pid=3754)[0m 
[2m[36m(train_distilbert pid=3754)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3754)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3754)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=3754)[0m   Batch size = 8


[2m[36m(train_distilbert pid=3754)[0m {'train_runtime': 103.0661, 'train_samples_per_second': 72.007, 'train_steps_per_second': 9.023, 'train_loss': 0.014452069805514427, 'epoch': 4.37}


Trial train_distilbert_b132615a reported r2=0.46 with parameters={'num_train_epochs': 4.363024452395542, 'learning_rate': 1.5428958522565375e-05, 'adam_epsilon': 1e-07, 'adam_beta1': 0.8448931347532064, 'adam_beta2': 0.9852772046282663, 'weight_decay': 0.3}.
Trial train_distilbert_b132615a completed. Last result: r2=0.4622734862771509
[2m[36m(train_distilbert pid=3754)[0m {'eval_loss': 0.01929772086441517, 'eval_mse': 0.01929772086441517, 'eval_rmse': 0.1389162391424179, 'eval_mae': 0.11077301949262619, 'eval_r2': 0.4622734862771509, 'eval_smape': 23.411544099301395, 'eval_runtime': 4.2495, 'eval_samples_per_second': 235.791, 'eval_steps_per_second': 29.65, 'epoch': 4.37}


[2m[36m(train_distilbert pid=3862)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight']
[2m[36m(train_distilbert pid=3862)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3862)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3862)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3862)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=3862)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=3862)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=3862)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=3862)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=3862)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=3862)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=3862)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=3862)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=3862)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=3862)[0m Saving model checkpoint to ./checkpoint-1500
[2m[36m(train_distilbert pid=3862)[0m Configuration saved in ./checkpoint-1500/config.json
[2m[36m(train_distilbert pid=3862)[0m Model weights saved in ./checkpoint-1500/pytorch_model.bin
[2m[36m(train_distilbert pid=3862)[0m tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
[2m[36m(train_distilbert pid=3862)[0m Special tokens file saved in ./checkpoint-1500/special_tokens_map.json


[2m[36m(train_distilbert pid=3862)[0m {'train_runtime': 196.7761, 'train_samples_per_second': 71.951, 'train_steps_per_second': 9.01, 'train_loss': 0.01053782794950241, 'epoch': 8.32}


[2m[36m(train_distilbert pid=3862)[0m 
[2m[36m(train_distilbert pid=3862)[0m 
[2m[36m(train_distilbert pid=3862)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3862)[0m 
[2m[36m(train_distilbert pid=3862)[0m 
[2m[36m(train_distilbert pid=3862)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3862)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3862)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=3862)[0m   Batch size = 8


Trial train_distilbert_ede1c2f8 reported r2=0.49 with parameters={'num_train_epochs': 8.323531707299102, 'learning_rate': 1.857295378811325e-05, 'adam_epsilon': 2.4916439803066257e-08, 'adam_beta1': 0.8109062777371857, 'adam_beta2': 0.9870507093459101, 'weight_decay': 0.2851666696787331}.
Trial train_distilbert_ede1c2f8 completed. Last result: r2=0.48670043794254747
[2m[36m(train_distilbert pid=3862)[0m {'eval_loss': 0.018421098589897156, 'eval_mse': 0.018421098589897156, 'eval_rmse': 0.13572435081005096, 'eval_mae': 0.10785385966300964, 'eval_r2': 0.48670043794254747, 'eval_smape': 22.908405844560875, 'eval_runtime': 4.2597, 'eval_samples_per_second': 235.226, 'eval_steps_per_second': 29.579, 'epoch': 8.32}


[2m[36m(train_distilbert pid=4005)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=4005)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=4005)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=4005)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=4005)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=4005)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=4005)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=4005)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=4005)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=4005)[0m {'train_runtime': 104.8026, 'train_samples_per_second': 72.301, 'train_steps_per_second': 9.055, 'train_loss': 0.015503368839951286, 'epoch': 4.46}


[2m[36m(train_distilbert pid=4005)[0m 
[2m[36m(train_distilbert pid=4005)[0m 
[2m[36m(train_distilbert pid=4005)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=4005)[0m 
[2m[36m(train_distilbert pid=4005)[0m 
[2m[36m(train_distilbert pid=4005)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=4005)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=4005)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=4005)[0m   Batch size = 8


Trial train_distilbert_30c0a882 reported r2=0.47 with parameters={'num_train_epochs': 4.454643294087422, 'learning_rate': 1.1816279118068556e-05, 'adam_epsilon': 7.03034681123448e-08, 'adam_beta1': 0.8549775285112468, 'adam_beta2': 0.9882278110202043, 'weight_decay': 0.3}.
Trial train_distilbert_30c0a882 completed. Last result: r2=0.4739528633152623
[2m[36m(train_distilbert pid=4005)[0m {'eval_loss': 0.018878575414419174, 'eval_mse': 0.018878577277064323, 'eval_rmse': 0.1373993307352066, 'eval_mae': 0.10944394022226334, 'eval_r2': 0.4739528633152623, 'eval_smape': 23.33355554515968, 'eval_runtime': 4.2466, 'eval_samples_per_second': 235.954, 'eval_steps_per_second': 29.671, 'epoch': 4.46}


[2m[36m(train_distilbert pid=4113)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight']
[2m[36m(train_distilbert pid=4113)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=4113)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=4113)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=4113)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=4113)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=4113)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=4113)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=4113)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=4113)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=4113)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=4113)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=4113)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=4113)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=4113)[0m Saving model checkpoint to ./checkpoint-1500
[2m[36m(train_distilbert pid=4113)[0m Configuration saved in ./checkpoint-1500/config.json
[2m[36m(train_distilbert pid=4113)[0m Model weights saved in ./checkpoint-1500/pytorch_model.bin
[2m[36m(train_distilbert pid=4113)[0m tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
[2m[36m(train_distilbert pid=4113)[0m Special tokens file saved in ./checkpoint-1500/special_tokens_map.json


[2m[36m(train_distilbert pid=4113)[0m Saving model checkpoint to ./checkpoint-2000
[2m[36m(train_distilbert pid=4113)[0m Configuration saved in ./checkpoint-2000/config.json
[2m[36m(train_distilbert pid=4113)[0m Model weights saved in ./checkpoint-2000/pytorch_model.bin
[2m[36m(train_distilbert pid=4113)[0m tokenizer config file saved in ./checkpoint-2000/tokenizer_config.json
[2m[36m(train_distilbert pid=4113)[0m Special tokens file saved in ./checkpoint-2000/special_tokens_map.json


[2m[36m(train_distilbert pid=4113)[0m 
[2m[36m(train_distilbert pid=4113)[0m 
[2m[36m(train_distilbert pid=4113)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=4113)[0m 
[2m[36m(train_distilbert pid=4113)[0m 
[2m[36m(train_distilbert pid=4113)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=4113)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=4113)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=4113)[0m   Batch size = 8


[2m[36m(train_distilbert pid=4113)[0m {'train_runtime': 265.9176, 'train_samples_per_second': 72.028, 'train_steps_per_second': 9.022, 'train_loss': 0.0075857107616454774, 'epoch': 11.26}


Trial train_distilbert_ae879410 reported r2=0.48 with parameters={'num_train_epochs': 11.26010258774387, 'learning_rate': 2.6607803946749363e-05, 'adam_epsilon': 2.24674071528958e-08, 'adam_beta1': 0.8, 'adam_beta2': 0.9849889154122279, 'weight_decay': 0.2460607693638045}.
Trial train_distilbert_ae879410 completed. Last result: r2=0.4795064066909146
[2m[36m(train_distilbert pid=4113)[0m {'eval_loss': 0.01867927424609661, 'eval_mse': 0.01867927424609661, 'eval_rmse': 0.13667213916778564, 'eval_mae': 0.10797936469316483, 'eval_r2': 0.4795064066909146, 'eval_smape': 23.001799136102793, 'eval_runtime': 4.2528, 'eval_samples_per_second': 235.608, 'eval_steps_per_second': 29.627, 'epoch': 11.26}


[2m[36m(train_distilbert pid=4266)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight']
[2m[36m(train_distilbert pid=4266)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=4266)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=4266)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=4266)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=4266)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=4266)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=4266)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=4266)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


[2m[36m(train_distilbert pid=4266)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=4266)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=4266)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=4266)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=4266)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=4266)[0m Saving model checkpoint to ./checkpoint-1500
[2m[36m(train_distilbert pid=4266)[0m Configuration saved in ./checkpoint-1500/config.json
[2m[36m(train_distilbert pid=4266)[0m Model weights saved in ./checkpoint-1500/pytorch_model.bin
[2m[36m(train_distilbert pid=4266)[0m tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
[2m[36m(train_distilbert pid=4266)[0m Special tokens file saved in ./checkpoint-1500/special_tokens_map.json


[2m[36m(train_distilbert pid=4266)[0m Saving model checkpoint to ./checkpoint-2000
[2m[36m(train_distilbert pid=4266)[0m Configuration saved in ./checkpoint-2000/config.json


[2m[36m(train_distilbert pid=4266)[0m Model weights saved in ./checkpoint-2000/pytorch_model.bin
[2m[36m(train_distilbert pid=4266)[0m tokenizer config file saved in ./checkpoint-2000/tokenizer_config.json
[2m[36m(train_distilbert pid=4266)[0m Special tokens file saved in ./checkpoint-2000/special_tokens_map.json


[2m[36m(train_distilbert pid=4266)[0m {'train_runtime': 234.1934, 'train_samples_per_second': 71.741, 'train_steps_per_second': 8.984, 'train_loss': 0.009393422775848736, 'epoch': 9.88}


[2m[36m(train_distilbert pid=4266)[0m 
[2m[36m(train_distilbert pid=4266)[0m 
[2m[36m(train_distilbert pid=4266)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=4266)[0m 
[2m[36m(train_distilbert pid=4266)[0m 
[2m[36m(train_distilbert pid=4266)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=4266)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=4266)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=4266)[0m   Batch size = 8


Trial train_distilbert_f60e0dfa reported r2=0.48 with parameters={'num_train_epochs': 9.877269710950776, 'learning_rate': 1.5592392760908312e-05, 'adam_epsilon': 3.553157092615332e-08, 'adam_beta1': 0.8, 'adam_beta2': 0.9829656060428444, 'weight_decay': 0.2943476669402031}.
Trial train_distilbert_f60e0dfa completed. Last result: r2=0.47759009833744626
[2m[36m(train_distilbert pid=4266)[0m {'eval_loss': 0.018748044967651367, 'eval_mse': 0.018748044967651367, 'eval_rmse': 0.13692350685596466, 'eval_mae': 0.10933854430913925, 'eval_r2': 0.47759009833744626, 'eval_smape': 23.371146379116766, 'eval_runtime': 4.2414, 'eval_samples_per_second': 236.244, 'eval_steps_per_second': 29.707, 'epoch': 9.88}


[2m[36m(train_distilbert pid=4416)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=4416)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=4416)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=4416)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=4416)[0m Saving model checkpoint to ./checkpoint-500
[2m[36m(train_distilbert pid=4416)[0m Configuration saved in ./checkpoint-500/config.json
[2m[36m(train_distilbert pid=4416)[0m Model weights saved in ./checkpoint-500/pytorch_model.bin
[2m[36m(train_distilbert pid=4416)[0m tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
[2m[36m(train_distilbert pid=4416)[0m Special tokens file saved in ./checkpoint-500/special_tokens_map.json


2022-12-02 05:30:33,139	INFO stopper.py:363 -- Reached timeout of 3600 seconds. Stopping all trials.


Trial name,status,loc,adam_beta1,adam_beta2,adam_epsilon,learning_rate,num_train_epochs,weight_decay,iter,total time (s),r2
train_distilbert_0c5ec8d0,TERMINATED,172.28.0.12:1110,0.868934,0.98729,1.9703e-08,1.56626e-06,1.0,0.226646,1.0,32.8451,0.0371399
train_distilbert_10a85884,TERMINATED,172.28.0.12:1205,0.802586,0.995234,3.25403e-08,3.41024e-06,1.44427,0.0299459,1.0,41.8021,0.187267
train_distilbert_24aefc66,TERMINATED,172.28.0.12:1304,0.845374,0.988751,3.16312e-08,1.48909e-06,1.0,0.175998,1.0,31.6102,0.0108226
train_distilbert_41e4cf86,TERMINATED,172.28.0.12:1399,0.892494,0.985831,1.22729e-08,1.64743e-06,1.02155,0.277294,1.0,32.2443,0.0247796
train_distilbert_59242c46,TERMINATED,172.28.0.12:1493,0.868625,0.985727,3.84131e-08,1.36325e-06,1.0,0.279604,1.0,31.6326,0.0568407
train_distilbert_706258e2,TERMINATED,172.28.0.12:1587,0.869242,0.988855,1.01061e-08,1.7995e-06,1.1163,0.173688,1.0,34.4753,0.0423844
train_distilbert_87a3b280,TERMINATED,172.28.0.12:1683,0.857986,0.986129,5.72013e-08,1e-06,2.30995,0.289247,1.0,62.0988,0.0662426
train_distilbert_a4310376,TERMINATED,172.28.0.12:1791,0.879265,0.985325,2.5796e-08,2.02689e-06,1.0,0.269961,1.0,31.7421,0.045148
train_distilbert_c9cc61a2,TERMINATED,172.28.0.12:1887,0.841587,0.985338,3.73367e-08,1.02605e-06,1.45422,0.237318,1.0,42.1019,0.0161436
train_distilbert_e10cee68,TERMINATED,172.28.0.12:1984,0.874385,0.986921,8.76346e-08,1e-06,3.66923,0.3,1.0,95.0856,0.0706021


[2m[36m(train_distilbert pid=4416)[0m Saving model checkpoint to ./checkpoint-1000
[2m[36m(train_distilbert pid=4416)[0m Configuration saved in ./checkpoint-1000/config.json
[2m[36m(train_distilbert pid=4416)[0m Model weights saved in ./checkpoint-1000/pytorch_model.bin
[2m[36m(train_distilbert pid=4416)[0m tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
[2m[36m(train_distilbert pid=4416)[0m Special tokens file saved in ./checkpoint-1000/special_tokens_map.json


[2m[36m(train_distilbert pid=4416)[0m {'train_runtime': 166.0707, 'train_samples_per_second': 71.844, 'train_steps_per_second': 9.002, 'train_loss': 0.011085732246322377, 'epoch': 7.02}


[2m[36m(train_distilbert pid=4416)[0m 
[2m[36m(train_distilbert pid=4416)[0m 
[2m[36m(train_distilbert pid=4416)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=4416)[0m 
[2m[36m(train_distilbert pid=4416)[0m 
[2m[36m(train_distilbert pid=4416)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=4416)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=4416)[0m   Num examples = 1002
[2m[36m(train_distilbert pid=4416)[0m   Batch size = 8
2022-12-02 05:31:46,869	INFO tune.py:747 -- Total run time: 3679.02 seconds (3605.12 seconds for the tuning loop).


[2m[36m(train_distilbert pid=4416)[0m {'eval_loss': 0.019225232303142548, 'eval_mse': 0.019225232303142548, 'eval_rmse': 0.13865508139133453, 'eval_mae': 0.11000686138868332, 'eval_r2': 0.4642933742508213, 'eval_smape': 23.33022626621756, 'eval_runtime': 4.2487, 'eval_samples_per_second': 235.839, 'eval_steps_per_second': 29.656, 'epoch': 7.02}


In [43]:
training_args = TrainingArguments(output_dir="test_trainer",
                                  logging_steps=20000,
                                  #per_device_train_batch_size=64,
                                  #per_device_eval_batch_size=20,
                                  learning_rate = 1.857295378811325e-05,
                                  adam_epsilon = 2.4916439803066257e-08,
                                  adam_beta1 = 0.8109062777371857,
                                  adam_beta2 = 0.9870507093459101,
                                  weight_decay = 0.2851666696787331,
                                  num_train_epochs=50,
                                  )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics_for_regression
)

In [44]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1701
  Num Epochs = 50
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10650
  Number of trainable parameters = 66954241


Step,Training Loss


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2000
Configuration saved in test_trainer/checkpoint-2000/config.json
Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2500
Configuration saved in test_trainer/checkpoint-2500/config.json
Model weights saved in test_trainer/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-30

TrainOutput(global_step=10650, training_loss=0.0027950387157744646, metrics={'train_runtime': 1224.5335, 'train_samples_per_second': 69.455, 'train_steps_per_second': 8.697, 'total_flos': 1.12661513362944e+16, 'train_loss': 0.0027950387157744646, 'epoch': 50.0})

In [45]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1002
  Batch size = 8


{'eval_loss': 0.018708493560552597,
 'eval_mse': 0.018708493560552597,
 'eval_rmse': 0.13677899539470673,
 'eval_mae': 0.10913343727588654,
 'eval_r2': 0.47869219056724877,
 'eval_smape': 23.115403957709578,
 'eval_runtime': 4.4244,
 'eval_samples_per_second': 226.471,
 'eval_steps_per_second': 28.478,
 'epoch': 50.0}

# Another Ray search and the Seventh Model

In [71]:
#New function to evaluate, now it includes the Person r 
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    r, p = scipy.stats.pearsonr(labels, predictions)
    r = r.item()
    return {"rmse": rmse, "pearsonr": r}

In [75]:
#Download the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=1)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/pytorch_model.bin
Some we

In [102]:
#Re-size to the length of the tokenizer 
model.resize_token_embeddings(len(tokenizer))

Embedding(30522, 768, padding_idx=0)

In [103]:
#Arguments for training 
args = TrainingArguments(
    output_dir='output',
    do_eval=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [104]:
#Define the trainer 
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [105]:
#Train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1446
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 543
  Number of trainable parameters = 66954241


Step,Training Loss
500,0.0148


Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
tokenizer config file saved in output/checkpoint-500/tokenizer_config.json
Special tokens file saved in output/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=543, training_loss=0.014244305549625094, metrics={'train_runtime': 216.7375, 'train_samples_per_second': 20.015, 'train_steps_per_second': 2.505, 'total_flos': 574633327417344.0, 'train_loss': 0.014244305549625094, 'epoch': 3.0})

In [106]:
#Evaluate the model
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 255
  Batch size = 8


{'eval_loss': 0.01071194838732481,
 'eval_rmse': 0.10349854826927185,
 'eval_pearsonr': 0.78164069617914,
 'eval_runtime': 4.5533,
 'eval_samples_per_second': 56.003,
 'eval_steps_per_second': 7.028,
 'epoch': 3.0}

In [107]:
!pip install flaml

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [108]:
import flaml

In [109]:
MODEL_CHECKPOINT = "distilbert-base-uncased"

In [110]:
def train_distilbert(config: dict):

    

    train_dataset, eval_dataset = tokenized_train_dataset, tokenized_val_dataset

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT, num_labels=1
    )

    model.resize_token_embeddings(len(tokenizer))

    def compute_metrics(eval_pred):

      predictions, labels = eval_pred
      rmse = mean_squared_error(labels, predictions, squared=False)
      r, p = scipy.stats.pearsonr(labels, predictions)
      r = r.item()
      return {"rmse": rmse, "pearsonr": r}


    training_args = TrainingArguments(
        output_dir='.',
        do_eval=False,
        disable_tqdm=True,
        logging_steps=20000,
        save_total_limit=0,
        **config,
    )

    trainer = Trainer(
        model,
        training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # train model
    trainer.train()

    # evaluate model
    eval_output = trainer.evaluate()

    # report the metric to optimize
    flaml.tune.report(
        pearsonr=eval_output["eval_pearsonr"],
    )

In [111]:
############ Version 2 ########################
#New search space for the hyperparameters to optimize 

max_num_epoch = 64
search_space = {
        # You can mix constants with search space objects.
        "num_train_epochs": flaml.tune.loguniform(1, max_num_epoch),
        "learning_rate": flaml.tune.loguniform(1e-6, 1e-4),
        "adam_epsilon": flaml.tune.loguniform(1e-9, 1e-7),
        "adam_beta1": flaml.tune.uniform(0.8, 0.99),
        "adam_beta2": flaml.tune.loguniform(98e-2, 9999e-4),
        "weight_decay": flaml.tune.uniform(0.0, 0.3),
        
}

In [112]:
# optimization objective
HP_METRIC, MODE = "pearsonr", "max"

# resources
num_cpus = 0
num_gpus = 1

# constraints
num_samples = -1    # number of trials, -1 means unlimited
time_budget_s = 3600    # time budget in seconds

In [76]:
!pip install pickle5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [77]:
import pickle5 as pickle

In [115]:
#Another ray tuning 
import time
import ray
start_time = time.time()
ray.shutdown()
ray.init(num_cpus=num_cpus, num_gpus=num_gpus)

print("Tuning started...")
analysis = flaml.tune.run(
    train_distilbert,
    search_alg=flaml.CFO(
        space=search_space,
        metric=HP_METRIC,
        mode=MODE,
        low_cost_partial_config={"num_train_epochs": 1}),
    # uncomment the following if scheduler = 'asha',
    # max_resource=max_num_epoch, min_resource=1,
    resources_per_trial={"gpu": num_gpus, "cpu": num_cpus},
    local_dir='logs/',
    num_samples=num_samples,
    time_budget_s=time_budget_s,
    use_ray=True,
)

ray.shutdown()



Tuning started...


[2m[36m(train_distilbert pid=2464)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=2464)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=2464)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=2464)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=2464)[0m {'train_runtime': 74.6131, 'train_samples_per_second': 19.38, 'train_steps_per_second': 2.426, 'train_loss': 0.08657695707036646, 'epoch': 1.0}


[2m[36m(train_distilbert pid=2464)[0m 
[2m[36m(train_distilbert pid=2464)[0m 
[2m[36m(train_distilbert pid=2464)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=2464)[0m 
[2m[36m(train_distilbert pid=2464)[0m 
[2m[36m(train_distilbert pid=2464)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=2464)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=2464)[0m   Num examples = 255
[2m[36m(train_distilbert pid=2464)[0m   Batch size = 8


Trial train_distilbert_77cd135a reported pearsonr=0.28 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1.5662610420278344e-06, 'adam_epsilon': 1.9702991167906198e-08, 'adam_beta1': 0.8689337534108345, 'adam_beta2': 0.9872898093714128, 'weight_decay': 0.22664606941097878}.
Trial train_distilbert_77cd135a completed. Last result: pearsonr=0.28123629088139435
[2m[36m(train_distilbert pid=2464)[0m {'eval_loss': 0.02464824542403221, 'eval_rmse': 0.15699759125709534, 'eval_pearsonr': 0.28123629088139435, 'eval_runtime': 4.5185, 'eval_samples_per_second': 56.435, 'eval_steps_per_second': 7.082, 'epoch': 1.0}


[2m[36m(train_distilbert pid=2578)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight']
[2m[36m(train_distilbert pid=2578)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=2578)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=2578)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=2578)[0m {'train_runtime': 104.3841, 'train_samples_per_second': 20.007, 'train_steps_per_second': 2.51, 'train_loss': 0.045996258277019475, 'epoch': 1.45}


[2m[36m(train_distilbert pid=2578)[0m 
[2m[36m(train_distilbert pid=2578)[0m 
[2m[36m(train_distilbert pid=2578)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=2578)[0m 
[2m[36m(train_distilbert pid=2578)[0m 
[2m[36m(train_distilbert pid=2578)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=2578)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=2578)[0m   Num examples = 255
[2m[36m(train_distilbert pid=2578)[0m   Batch size = 8


Trial train_distilbert_7c710ec0 reported pearsonr=0.53 with parameters={'num_train_epochs': 1.444265389543504, 'learning_rate': 3.4102391893542776e-06, 'adam_epsilon': 3.2540285965060775e-08, 'adam_beta1': 0.8025861976630991, 'adam_beta2': 0.9952338248324752, 'weight_decay': 0.029945923065410704}.
[2m[36m(train_distilbert pid=2578)[0m {'eval_loss': 0.01961401104927063, 'eval_rmse': 0.1400500237941742, 'eval_pearsonr': 0.5345884227276733, 'eval_runtime': 4.6194, 'eval_samples_per_second': 55.202, 'eval_steps_per_second': 6.927, 'epoch': 1.45}
Trial train_distilbert_7c710ec0 completed. Last result: pearsonr=0.5345884227276733


[2m[36m(train_distilbert pid=2702)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
[2m[36m(train_distilbert pid=2702)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=2702)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=2702)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=2702)[0m 
[2m[36m(train_distilbert pid=2702)[0m 
[2m[36m(train_distilbert pid=2702)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=2702)[0m 
[2m[36m(train_distilbert pid=2702)[0m 
[2m[36m(train_distilbert pid=2702)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=2702)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=2702)[0m   Num examples = 255
[2m[36m(train_distilbert pid=2702)[0m   Batch size = 8


[2m[36m(train_distilbert pid=2702)[0m {'train_runtime': 72.4458, 'train_samples_per_second': 19.96, 'train_steps_per_second': 2.498, 'train_loss': 0.06319442770099112, 'epoch': 1.0}


Trial train_distilbert_aec07e42 reported pearsonr=0.32 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1.489090831567398e-06, 'adam_epsilon': 3.163121807821293e-08, 'adam_beta1': 0.84537379213597, 'adam_beta2': 0.9887505670214453, 'weight_decay': 0.175998257587662}.
Trial train_distilbert_aec07e42 completed. Last result: pearsonr=0.3235139991500585
[2m[36m(train_distilbert pid=2702)[0m {'eval_loss': 0.023653998970985413, 'eval_rmse': 0.15379856526851654, 'eval_pearsonr': 0.3235139991500585, 'eval_runtime': 4.5118, 'eval_samples_per_second': 56.518, 'eval_steps_per_second': 7.092, 'epoch': 1.0}


[2m[36m(train_distilbert pid=2814)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=2814)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=2814)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=2814)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=2814)[0m 
[2m[36m(train_distilbert pid=2814)[0m 
[2m[36m(train_distilbert pid=2814)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=2814)[0m 
[2m[36m(train_distilbert pid=2814)[0m 
[2m[36m(train_distilbert pid=2814)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=2814)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=2814)[0m   Num examples = 255
[2m[36m(train_distilbert pid=2814)[0m   Batch size = 8


[2m[36m(train_distilbert pid=2814)[0m {'train_runtime': 74.0253, 'train_samples_per_second': 19.955, 'train_steps_per_second': 2.499, 'train_loss': 0.06156680132891681, 'epoch': 1.02}


Trial train_distilbert_f648e20e reported pearsonr=0.30 with parameters={'num_train_epochs': 1.021552403070911, 'learning_rate': 1.647430499046145e-06, 'adam_epsilon': 1.227293428924196e-08, 'adam_beta1': 0.8924937146856989, 'adam_beta2': 0.9858312098115838, 'weight_decay': 0.2772938812342956}.
Trial train_distilbert_f648e20e completed. Last result: pearsonr=0.2984216115870664
[2m[36m(train_distilbert pid=2814)[0m {'eval_loss': 0.024028781801462173, 'eval_rmse': 0.15501220524311066, 'eval_pearsonr': 0.2984216115870664, 'eval_runtime': 4.5004, 'eval_samples_per_second': 56.662, 'eval_steps_per_second': 7.111, 'epoch': 1.02}


[2m[36m(train_distilbert pid=2924)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight']
[2m[36m(train_distilbert pid=2924)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=2924)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=2924)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=2924)[0m {'train_runtime': 72.383, 'train_samples_per_second': 19.977, 'train_steps_per_second': 2.501, 'train_loss': 0.10446588505697514, 'epoch': 1.0}


[2m[36m(train_distilbert pid=2924)[0m 
[2m[36m(train_distilbert pid=2924)[0m 
[2m[36m(train_distilbert pid=2924)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=2924)[0m 
[2m[36m(train_distilbert pid=2924)[0m 
[2m[36m(train_distilbert pid=2924)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=2924)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=2924)[0m   Num examples = 255
[2m[36m(train_distilbert pid=2924)[0m   Batch size = 8


Trial train_distilbert_2a2cfccc reported pearsonr=0.47 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1.2960847170540239e-06, 'adam_epsilon': 6.16684821113724e-08, 'adam_beta1': 0.845065514298112, 'adam_beta2': 0.9871855315607734, 'weight_decay': 0.22895656538340642}.
Trial train_distilbert_2a2cfccc completed. Last result: pearsonr=0.47021541524068083
[2m[36m(train_distilbert pid=2924)[0m {'eval_loss': 0.02090456336736679, 'eval_rmse': 0.14458410441875458, 'eval_pearsonr': 0.47021541524068083, 'eval_runtime': 4.5041, 'eval_samples_per_second': 56.615, 'eval_steps_per_second': 7.105, 'epoch': 1.0}


[2m[36m(train_distilbert pid=3036)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=3036)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3036)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3036)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3036)[0m 
[2m[36m(train_distilbert pid=3036)[0m 
[2m[36m(train_distilbert pid=3036)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3036)[0m 
[2m[36m(train_distilbert pid=3036)[0m 
[2m[36m(train_distilbert pid=3036)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3036)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3036)[0m   Num examples = 255
[2m[36m(train_distilbert pid=3036)[0m   Batch size = 8


[2m[36m(train_distilbert pid=3036)[0m {'train_runtime': 82.1653, 'train_samples_per_second': 19.645, 'train_steps_per_second': 2.471, 'train_loss': 0.07488197646117563, 'epoch': 1.12}
Trial train_distilbert_5f35023e reported pearsonr=0.34 with parameters={'num_train_epochs': 1.1163034689325957, 'learning_rate': 1.7108384008247348e-06, 'adam_epsilon': 1.6224397339706115e-08, 'adam_beta1': 0.8456820699738279, 'adam_beta2': 0.9903180836124769, 'weight_decay': 0.12303994979191755}.
Trial train_distilbert_5f35023e completed. Last result: pearsonr=0.3384532151939916
[2m[36m(train_distilbert pid=3036)[0m {'eval_loss': 0.023356296122074127, 'eval_rmse': 0.1528276801109314, 'eval_pearsonr': 0.3384532151939916, 'eval_runtime': 4.5045, 'eval_samples_per_second': 56.61, 'eval_steps_per_second': 7.104, 'epoch': 1.12}


[2m[36m(train_distilbert pid=3144)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
[2m[36m(train_distilbert pid=3144)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3144)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3144)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3144)[0m 
[2m[36m(train_distilbert pid=3144)[0m 
[2m[36m(train_distilbert pid=3144)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3144)[0m 
[2m[36m(train_distilbert pid=3144)[0m 
[2m[36m(train_distilbert pid=3144)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3144)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3144)[0m   Num examples = 255
[2m[36m(train_distilbert pid=3144)[0m   Batch size = 8


[2m[36m(train_distilbert pid=3144)[0m {'train_runtime': 166.4994, 'train_samples_per_second': 20.061, 'train_steps_per_second': 2.517, 'train_loss': 0.03865769597966233, 'epoch': 2.31}


Trial train_distilbert_93139d68 reported pearsonr=0.43 with parameters={'num_train_epochs': 2.3099532396348588, 'learning_rate': 1e-06, 'adam_epsilon': 9.183108913152316e-08, 'adam_beta1': 0.8344261188939788, 'adam_beta2': 0.9875881996419343, 'weight_decay': 0.23859962148574662}.
Trial train_distilbert_93139d68 completed. Last result: pearsonr=0.43159999279878447
[2m[36m(train_distilbert pid=3144)[0m {'eval_loss': 0.02221551723778248, 'eval_rmse': 0.14904871582984924, 'eval_pearsonr': 0.43159999279878447, 'eval_runtime': 4.5247, 'eval_samples_per_second': 56.357, 'eval_steps_per_second': 7.072, 'epoch': 2.31}


[2m[36m(train_distilbert pid=3263)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight']
[2m[36m(train_distilbert pid=3263)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3263)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3263)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3263)[0m {'train_runtime': 72.4662, 'train_samples_per_second': 19.954, 'train_steps_per_second': 2.498, 'train_loss': 0.0717884242863945, 'epoch': 1.0}


[2m[36m(train_distilbert pid=3263)[0m 
[2m[36m(train_distilbert pid=3263)[0m 
[2m[36m(train_distilbert pid=3263)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3263)[0m 
[2m[36m(train_distilbert pid=3263)[0m 
[2m[36m(train_distilbert pid=3263)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3263)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3263)[0m   Num examples = 255
[2m[36m(train_distilbert pid=3263)[0m   Batch size = 8


Trial train_distilbert_d064b9d6 reported pearsonr=0.35 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1.927019982067525e-06, 'adam_epsilon': 4.141300862144731e-08, 'adam_beta1': 0.8557049097022453, 'adam_beta2': 0.9867830276589572, 'weight_decay': 0.21931350928106622}.
[2m[36m(train_distilbert pid=3263)[0m {'eval_loss': 0.023243825882673264, 'eval_rmse': 0.1524592638015747, 'eval_pearsonr': 0.3522820000949558, 'eval_runtime': 4.5242, 'eval_samples_per_second': 56.363, 'eval_steps_per_second': 7.073, 'epoch': 1.0}
Trial train_distilbert_d064b9d6 completed. Last result: pearsonr=0.3522820000949558


[2m[36m(train_distilbert pid=3357)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=3357)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3357)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3357)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3357)[0m 
[2m[36m(train_distilbert pid=3357)[0m 
[2m[36m(train_distilbert pid=3357)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3357)[0m 
[2m[36m(train_distilbert pid=3357)[0m 
[2m[36m(train_distilbert pid=3357)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3357)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3357)[0m   Num examples = 255
[2m[36m(train_distilbert pid=3357)[0m   Batch size = 8


[2m[36m(train_distilbert pid=3357)[0m {'train_runtime': 72.4989, 'train_samples_per_second': 19.945, 'train_steps_per_second': 2.497, 'train_loss': 0.07662863230836985, 'epoch': 1.0}


Trial train_distilbert_3cde6616 reported pearsonr=0.14 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1.329844035771342e-06, 'adam_epsilon': 4.025254976681223e-08, 'adam_beta1': 0.8286663944235804, 'adam_beta2': 0.9863934119178718, 'weight_decay': 0.17702737123987064}.
Trial train_distilbert_3cde6616 completed. Last result: pearsonr=0.1387559251667215
[2m[36m(train_distilbert pid=3357)[0m {'eval_loss': 0.027205392718315125, 'eval_rmse': 0.16494058072566986, 'eval_pearsonr': 0.1387559251667215, 'eval_runtime': 4.4936, 'eval_samples_per_second': 56.748, 'eval_steps_per_second': 7.121, 'epoch': 1.0}


[2m[36m(train_distilbert pid=3450)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
[2m[36m(train_distilbert pid=3450)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3450)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3450)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3450)[0m 
[2m[36m(train_distilbert pid=3450)[0m 
[2m[36m(train_distilbert pid=3450)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3450)[0m 
[2m[36m(train_distilbert pid=3450)[0m 
[2m[36m(train_distilbert pid=3450)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3450)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3450)[0m   Num examples = 255
[2m[36m(train_distilbert pid=3450)[0m   Batch size = 8


[2m[36m(train_distilbert pid=3450)[0m {'train_runtime': 114.2878, 'train_samples_per_second': 20.097, 'train_steps_per_second': 2.52, 'train_loss': 0.0934326450030009, 'epoch': 1.59}


Trial train_distilbert_6dd4014a reported pearsonr=0.42 with parameters={'num_train_epochs': 1.5884440108684583, 'learning_rate': 1.2631824098129397e-06, 'adam_epsilon': 9.447852888703702e-08, 'adam_beta1': 0.8614646341726437, 'adam_beta2': 0.9879782873124741, 'weight_decay': 0.2808857595269422}.
Trial train_distilbert_6dd4014a completed. Last result: pearsonr=0.422607465547113
[2m[36m(train_distilbert pid=3450)[0m {'eval_loss': 0.022193673998117447, 'eval_rmse': 0.1489754170179367, 'eval_pearsonr': 0.422607465547113, 'eval_runtime': 4.6263, 'eval_samples_per_second': 55.12, 'eval_steps_per_second': 6.917, 'epoch': 1.59}


[2m[36m(train_distilbert pid=3556)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=3556)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3556)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3556)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3556)[0m {'train_runtime': 72.387, 'train_samples_per_second': 19.976, 'train_steps_per_second': 2.5, 'train_loss': 0.05446391342753205, 'epoch': 1.0}


[2m[36m(train_distilbert pid=3556)[0m 
[2m[36m(train_distilbert pid=3556)[0m 
[2m[36m(train_distilbert pid=3556)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3556)[0m 
[2m[36m(train_distilbert pid=3556)[0m 
[2m[36m(train_distilbert pid=3556)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3556)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3556)[0m   Num examples = 255
[2m[36m(train_distilbert pid=3556)[0m   Batch size = 8


Trial train_distilbert_a24cd302 reported pearsonr=0.33 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1.4753028820110493e-06, 'adam_epsilon': 8.838341260843817e-08, 'adam_beta1': 0.8647087175730349, 'adam_beta2': 0.9860689265555309, 'weight_decay': 0.2543324173889758}.
Trial train_distilbert_a24cd302 completed. Last result: pearsonr=0.3293501737761485
[2m[36m(train_distilbert pid=3556)[0m {'eval_loss': 0.023598676547408104, 'eval_rmse': 0.15361860394477844, 'eval_pearsonr': 0.3293501737761485, 'eval_runtime': 4.5317, 'eval_samples_per_second': 56.271, 'eval_steps_per_second': 7.061, 'epoch': 1.0}


[2m[36m(train_distilbert pid=3650)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=3650)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3650)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3650)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3650)[0m 
[2m[36m(train_distilbert pid=3650)[0m 
[2m[36m(train_distilbert pid=3650)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3650)[0m 
[2m[36m(train_distilbert pid=3650)[0m 
[2m[36m(train_distilbert pid=3650)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3650)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3650)[0m   Num examples = 255
[2m[36m(train_distilbert pid=3650)[0m   Batch size = 8


[2m[36m(train_distilbert pid=3650)[0m {'train_runtime': 151.8529, 'train_samples_per_second': 20.058, 'train_steps_per_second': 2.516, 'train_loss': 0.09047226631204495, 'epoch': 2.11}


Trial train_distilbert_efcb85f6 reported pearsonr=0.38 with parameters={'num_train_epochs': 2.1064177423613923, 'learning_rate': 1.1386377768686742e-06, 'adam_epsilon': 4.302845492930858e-08, 'adam_beta1': 0.8254223110231892, 'adam_beta2': 0.988303400987502, 'weight_decay': 0.2035807133778371}.
Trial train_distilbert_efcb85f6 completed. Last result: pearsonr=0.3818116658084686
[2m[36m(train_distilbert pid=3650)[0m {'eval_loss': 0.022539736703038216, 'eval_rmse': 0.15013238787651062, 'eval_pearsonr': 0.3818116658084686, 'eval_runtime': 4.5681, 'eval_samples_per_second': 55.822, 'eval_steps_per_second': 7.005, 'epoch': 2.11}


[2m[36m(train_distilbert pid=3775)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=3775)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3775)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3775)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3775)[0m {'train_runtime': 118.0231, 'train_samples_per_second': 20.045, 'train_steps_per_second': 2.516, 'train_loss': 0.04894487143365622, 'epoch': 1.64}


[2m[36m(train_distilbert pid=3775)[0m 
[2m[36m(train_distilbert pid=3775)[0m 
[2m[36m(train_distilbert pid=3775)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3775)[0m 
[2m[36m(train_distilbert pid=3775)[0m 
[2m[36m(train_distilbert pid=3775)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3775)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3775)[0m   Num examples = 255
[2m[36m(train_distilbert pid=3775)[0m   Batch size = 8


Trial train_distilbert_23b87ee6 reported pearsonr=0.48 with parameters={'num_train_epochs': 1.6360565216175198, 'learning_rate': 1.952487904242541e-06, 'adam_epsilon': 3.811483295069611e-08, 'adam_beta1': 0.8200010004033704, 'adam_beta2': 0.9877890951717274, 'weight_decay': 0.2571429881233818}.
[2m[36m(train_distilbert pid=3775)[0m {'eval_loss': 0.021335460245609283, 'eval_rmse': 0.14606663584709167, 'eval_pearsonr': 0.4842928360148923, 'eval_runtime': 4.6418, 'eval_samples_per_second': 54.935, 'eval_steps_per_second': 6.894, 'epoch': 1.64}
Trial train_distilbert_23b87ee6 completed. Last result: pearsonr=0.4842928360148923


[2m[36m(train_distilbert pid=3884)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias']
[2m[36m(train_distilbert pid=3884)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3884)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3884)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3884)[0m 
[2m[36m(train_distilbert pid=3884)[0m 
[2m[36m(train_distilbert pid=3884)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3884)[0m 
[2m[36m(train_distilbert pid=3884)[0m 
[2m[36m(train_distilbert pid=3884)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3884)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3884)[0m   Num examples = 255
[2m[36m(train_distilbert pid=3884)[0m   Batch size = 8


[2m[36m(train_distilbert pid=3884)[0m {'train_runtime': 72.2336, 'train_samples_per_second': 20.018, 'train_steps_per_second': 2.506, 'train_loss': 0.11886192026717887, 'epoch': 1.0}
Trial train_distilbert_8aa9b2c8 reported pearsonr=0.43 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1e-06, 'adam_epsilon': 9.977747222033173e-08, 'adam_beta1': 0.8701300281928537, 'adam_beta2': 0.9865823367421399, 'weight_decay': 0.200770142643431}.


Trial train_distilbert_8aa9b2c8 completed. Last result: pearsonr=0.4262386209783994
[2m[36m(train_distilbert pid=3884)[0m {'eval_loss': 0.03085251711308956, 'eval_rmse': 0.17564885318279266, 'eval_pearsonr': 0.4262386209783994, 'eval_runtime': 4.5343, 'eval_samples_per_second': 56.238, 'eval_steps_per_second': 7.057, 'epoch': 1.0}


[2m[36m(train_distilbert pid=3977)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=3977)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=3977)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=3977)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=3977)[0m {'train_runtime': 115.2499, 'train_samples_per_second': 20.068, 'train_steps_per_second': 2.516, 'train_loss': 0.0332249509877172, 'epoch': 1.6}


[2m[36m(train_distilbert pid=3977)[0m 
[2m[36m(train_distilbert pid=3977)[0m 
[2m[36m(train_distilbert pid=3977)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=3977)[0m 
[2m[36m(train_distilbert pid=3977)[0m 
[2m[36m(train_distilbert pid=3977)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=3977)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=3977)[0m   Num examples = 255
[2m[36m(train_distilbert pid=3977)[0m   Batch size = 8


Trial train_distilbert_d67f8ce0 reported pearsonr=0.67 with parameters={'num_train_epochs': 1.5995041979393643, 'learning_rate': 5.306666363737239e-06, 'adam_epsilon': 4.929038758292255e-08, 'adam_beta1': 0.8297058167298363, 'adam_beta2': 0.9891327920799797, 'weight_decay': 0.27223774197978906}.
Trial train_distilbert_d67f8ce0 completed. Last result: pearsonr=0.6650237295257004
[2m[36m(train_distilbert pid=3977)[0m {'eval_loss': 0.015171333216130733, 'eval_rmse': 0.12317196279764175, 'eval_pearsonr': 0.6650237295257004, 'eval_runtime': 4.6191, 'eval_samples_per_second': 55.206, 'eval_steps_per_second': 6.928, 'epoch': 1.6}


[2m[36m(train_distilbert pid=4080)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias']
[2m[36m(train_distilbert pid=4080)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=4080)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=4080)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=4080)[0m 
[2m[36m(train_distilbert pid=4080)[0m 
[2m[36m(train_distilbert pid=4080)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=4080)[0m 
[2m[36m(train_distilbert pid=4080)[0m 
[2m[36m(train_distilbert pid=4080)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=4080)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=4080)[0m   Num examples = 255
[2m[36m(train_distilbert pid=4080)[0m   Batch size = 8


[2m[36m(train_distilbert pid=4080)[0m {'train_runtime': 120.4297, 'train_samples_per_second': 20.093, 'train_steps_per_second': 2.516, 'train_loss': 0.09402207100745474, 'epoch': 1.67}


Trial train_distilbert_0a60b098 reported pearsonr=0.39 with parameters={'num_train_epochs': 1.67344414936552, 'learning_rate': 1e-06, 'adam_epsilon': 2.9473099362740586e-08, 'adam_beta1': 0.8102961840769045, 'adam_beta2': 0.9864472236214005, 'weight_decay': 0.24204823426697455}.
Trial train_distilbert_0a60b098 completed. Last result: pearsonr=0.38821420879911767
[2m[36m(train_distilbert pid=4080)[0m {'eval_loss': 0.022595327347517014, 'eval_rmse': 0.15031741559505463, 'eval_pearsonr': 0.38821420879911767, 'eval_runtime': 4.6302, 'eval_samples_per_second': 55.073, 'eval_steps_per_second': 6.911, 'epoch': 1.67}


[2m[36m(train_distilbert pid=4189)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']
[2m[36m(train_distilbert pid=4189)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=4189)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=4189)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=4189)[0m 
[2m[36m(train_distilbert pid=4189)[0m 
[2m[36m(train_distilbert pid=4189)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=4189)[0m 
[2m[36m(train_distilbert pid=4189)[0m 
[2m[36m(train_distilbert pid=4189)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=4189)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=4189)[0m   Num examples = 255
[2m[36m(train_distilbert pid=4189)[0m   Batch size = 8


[2m[36m(train_distilbert pid=4189)[0m {'train_runtime': 96.6636, 'train_samples_per_second': 20.038, 'train_steps_per_second': 2.514, 'train_loss': 0.02443584395043644, 'epoch': 1.34}


Trial train_distilbert_57e3c38c reported pearsonr=0.72 with parameters={'num_train_epochs': 1.3395003702939983, 'learning_rate': 1.2496195873464926e-05, 'adam_epsilon': 7.554357299684655e-08, 'adam_beta1': 0.8190240502775854, 'adam_beta2': 0.9885614559149825, 'weight_decay': 0.24075999142823515}.
Trial train_distilbert_57e3c38c completed. Last result: pearsonr=0.7239499130115219
[2m[36m(train_distilbert pid=4189)[0m {'eval_loss': 0.013085578568279743, 'eval_rmse': 0.11439221352338791, 'eval_pearsonr': 0.7239499130115219, 'eval_runtime': 4.5707, 'eval_samples_per_second': 55.79, 'eval_steps_per_second': 7.001, 'epoch': 1.34}


[2m[36m(train_distilbert pid=4294)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight']
[2m[36m(train_distilbert pid=4294)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=4294)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=4294)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=4294)[0m 
[2m[36m(train_distilbert pid=4294)[0m 
[2m[36m(train_distilbert pid=4294)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=4294)[0m 
[2m[36m(train_distilbert pid=4294)[0m 
[2m[36m(train_distilbert pid=4294)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=4294)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=4294)[0m   Num examples = 255
[2m[36m(train_distilbert pid=4294)[0m   Batch size = 8


[2m[36m(train_distilbert pid=4294)[0m {'train_runtime': 137.4121, 'train_samples_per_second': 20.099, 'train_steps_per_second': 2.518, 'train_loss': 0.03570986069695798, 'epoch': 1.91}
Trial train_distilbert_a8ef9558 reported pearsonr=0.55 with parameters={'num_train_epochs': 1.909976089565484, 'learning_rate': 2.253542452532944e-06, 'adam_epsilon': 3.2160807487569364e-08, 'adam_beta1': 0.8403875831820873, 'adam_beta2': 0.9897044584470212, 'weight_decay': 0.3}.
Trial train_distilbert_a8ef9558 completed. Last result: pearsonr=0.5496257820968428
[2m[36m(train_distilbert pid=4294)[0m {'eval_loss': 0.019142715260386467, 'eval_rmse': 0.13835720717906952, 'eval_pearsonr': 0.5496257820968428, 'eval_runtime': 4.5567, 'eval_samples_per_second': 55.962, 'eval_steps_per_second': 7.023, 'epoch': 1.91}


[2m[36m(train_distilbert pid=4433)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
[2m[36m(train_distilbert pid=4433)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=4433)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=4433)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=4433)[0m {'train_runtime': 125.2445, 'train_samples_per_second': 20.029, 'train_steps_per_second': 2.515, 'train_loss': 0.027025349934895835, 'epoch': 1.74}


[2m[36m(train_distilbert pid=4433)[0m 
[2m[36m(train_distilbert pid=4433)[0m 
[2m[36m(train_distilbert pid=4433)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=4433)[0m 
[2m[36m(train_distilbert pid=4433)[0m 
[2m[36m(train_distilbert pid=4433)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=4433)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=4433)[0m   Num examples = 255
[2m[36m(train_distilbert pid=4433)[0m   Batch size = 8


Trial train_distilbert_ef2a9ca2 reported pearsonr=0.71 with parameters={'num_train_epochs': 1.7348237773251656, 'learning_rate': 5.336278278648773e-06, 'adam_epsilon': 6.193225295901875e-08, 'adam_beta1': 0.8374040432358717, 'adam_beta2': 0.9900036795760274, 'weight_decay': 0.263039453018149}.
Trial train_distilbert_ef2a9ca2 completed. Last result: pearsonr=0.711584547363269
[2m[36m(train_distilbert pid=4433)[0m {'eval_loss': 0.013240017928183079, 'eval_rmse': 0.11506526917219162, 'eval_pearsonr': 0.711584547363269, 'eval_runtime': 4.6329, 'eval_samples_per_second': 55.042, 'eval_steps_per_second': 6.907, 'epoch': 1.74}


[2m[36m(train_distilbert pid=4564)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
[2m[36m(train_distilbert pid=4564)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=4564)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=4564)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=4564)[0m {'train_runtime': 75.1289, 'train_samples_per_second': 19.906, 'train_steps_per_second': 2.502, 'train_loss': 0.023834317288500197, 'epoch': 1.04}


[2m[36m(train_distilbert pid=4564)[0m 
[2m[36m(train_distilbert pid=4564)[0m 
[2m[36m(train_distilbert pid=4564)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=4564)[0m 
[2m[36m(train_distilbert pid=4564)[0m 
[2m[36m(train_distilbert pid=4564)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=4564)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=4564)[0m   Num examples = 255
[2m[36m(train_distilbert pid=4564)[0m   Batch size = 8


Trial train_distilbert_49e8bce6 reported pearsonr=0.77 with parameters={'num_train_epochs': 1.034261384625611, 'learning_rate': 2.926288756956328e-05, 'adam_epsilon': 9.21463558722167e-08, 'adam_beta1': 0.8006440573192991, 'adam_beta2': 0.98712133326541, 'weight_decay': 0.2184805298383213}.
Trial train_distilbert_49e8bce6 completed. Last result: pearsonr=0.769228349014567
[2m[36m(train_distilbert pid=4564)[0m {'eval_loss': 0.010970080271363258, 'eval_rmse': 0.10473815351724625, 'eval_pearsonr': 0.769228349014567, 'eval_runtime': 4.529, 'eval_samples_per_second': 56.304, 'eval_steps_per_second': 7.066, 'epoch': 1.04}


[2m[36m(train_distilbert pid=4675)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=4675)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=4675)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=4675)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=4675)[0m 
[2m[36m(train_distilbert pid=4675)[0m 
[2m[36m(train_distilbert pid=4675)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=4675)[0m 
[2m[36m(train_distilbert pid=4675)[0m 
[2m[36m(train_distilbert pid=4675)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=4675)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=4675)[0m   Num examples = 255
[2m[36m(train_distilbert pid=4675)[0m   Batch size = 8


[2m[36m(train_distilbert pid=4675)[0m {'train_runtime': 72.4727, 'train_samples_per_second': 19.952, 'train_steps_per_second': 2.497, 'train_loss': 0.02447471565963155, 'epoch': 1.0}


Trial train_distilbert_9a5b482e reported pearsonr=0.76 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1.7460249528992767e-05, 'adam_epsilon': 7.123054866563196e-08, 'adam_beta1': 0.842931967499984, 'adam_beta2': 0.9880397617146311, 'weight_decay': 0.2886236049635651}.
[2m[36m(train_distilbert pid=4675)[0m {'eval_loss': 0.0113780302926898, 'eval_rmse': 0.10666785389184952, 'eval_pearsonr': 0.7559945398300238, 'eval_runtime': 4.4925, 'eval_samples_per_second': 56.761, 'eval_steps_per_second': 7.123, 'epoch': 1.0}
Trial train_distilbert_9a5b482e completed. Last result: pearsonr=0.7559945398300238


[2m[36m(train_distilbert pid=4786)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
[2m[36m(train_distilbert pid=4786)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=4786)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=4786)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=4786)[0m 
[2m[36m(train_distilbert pid=4786)[0m 
[2m[36m(train_distilbert pid=4786)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=4786)[0m 
[2m[36m(train_distilbert pid=4786)[0m 
[2m[36m(train_distilbert pid=4786)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=4786)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=4786)[0m   Num examples = 255
[2m[36m(train_distilbert pid=4786)[0m   Batch size = 8


[2m[36m(train_distilbert pid=4786)[0m {'train_runtime': 119.1176, 'train_samples_per_second': 20.018, 'train_steps_per_second': 2.51, 'train_loss': 0.020694842705359824, 'epoch': 1.65}


Trial train_distilbert_d0335eb4 reported pearsonr=0.77 with parameters={'num_train_epochs': 1.649069568631859, 'learning_rate': 2.094327313508555e-05, 'adam_epsilon': 9.772583661965892e-08, 'adam_beta1': 0.8, 'adam_beta2': 0.9876425425269827, 'weight_decay': 0.1706169163029913}.
Trial train_distilbert_d0335eb4 completed. Last result: pearsonr=0.7678165687407554
[2m[36m(train_distilbert pid=4786)[0m {'eval_loss': 0.011210951954126358, 'eval_rmse': 0.10588178783655167, 'eval_pearsonr': 0.7678165687407554, 'eval_runtime': 4.634, 'eval_samples_per_second': 55.028, 'eval_steps_per_second': 6.905, 'epoch': 1.65}


[2m[36m(train_distilbert pid=4903)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight']
[2m[36m(train_distilbert pid=4903)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=4903)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=4903)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=4903)[0m 
[2m[36m(train_distilbert pid=4903)[0m 
[2m[36m(train_distilbert pid=4903)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=4903)[0m 
[2m[36m(train_distilbert pid=4903)[0m 
[2m[36m(train_distilbert pid=4903)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=4903)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=4903)[0m   Num examples = 255
[2m[36m(train_distilbert pid=4903)[0m   Batch size = 8


[2m[36m(train_distilbert pid=4903)[0m {'train_runtime': 99.201, 'train_samples_per_second': 20.049, 'train_steps_per_second': 2.51, 'train_loss': 0.020396039189105052, 'epoch': 1.38}


Trial train_distilbert_051bcff8 reported pearsonr=0.78 with parameters={'num_train_epochs': 1.3754030919162836, 'learning_rate': 5.166521304410478e-05, 'adam_epsilon': 6.837013226375621e-08, 'adam_beta1': 0.8, 'adam_beta2': 0.9858580464571414, 'weight_decay': 0.2680545279738851}.
Trial train_distilbert_051bcff8 completed. Last result: pearsonr=0.7775081354838481
[2m[36m(train_distilbert pid=4903)[0m {'eval_loss': 0.01110247615724802, 'eval_rmse': 0.10536829382181168, 'eval_pearsonr': 0.7775081354838481, 'eval_runtime': 4.5517, 'eval_samples_per_second': 56.023, 'eval_steps_per_second': 7.03, 'epoch': 1.38}


[2m[36m(train_distilbert pid=5005)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight']
[2m[36m(train_distilbert pid=5005)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=5005)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=5005)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=5005)[0m {'train_runtime': 72.3069, 'train_samples_per_second': 19.998, 'train_steps_per_second': 2.503, 'train_loss': 0.026739167903668315, 'epoch': 1.0}


[2m[36m(train_distilbert pid=5005)[0m 
[2m[36m(train_distilbert pid=5005)[0m 
[2m[36m(train_distilbert pid=5005)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=5005)[0m 
[2m[36m(train_distilbert pid=5005)[0m 
[2m[36m(train_distilbert pid=5005)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=5005)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=5005)[0m   Num examples = 255
[2m[36m(train_distilbert pid=5005)[0m   Batch size = 8


Trial train_distilbert_558ccc12 reported pearsonr=0.74 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1.6574335775561263e-05, 'adam_epsilon': 1e-07, 'adam_beta1': 0.813389860590916, 'adam_beta2': 0.98838623886004, 'weight_decay': 0.16890653170275743}.
Trial train_distilbert_558ccc12 completed. Last result: pearsonr=0.7422870908361543
[2m[36m(train_distilbert pid=5005)[0m {'eval_loss': 0.011978885158896446, 'eval_rmse': 0.10944809764623642, 'eval_pearsonr': 0.7422870908361543, 'eval_runtime': 4.5227, 'eval_samples_per_second': 56.382, 'eval_steps_per_second': 7.075, 'epoch': 1.0}


[2m[36m(train_distilbert pid=5101)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
[2m[36m(train_distilbert pid=5101)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=5101)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=5101)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=5101)[0m {'train_runtime': 72.4146, 'train_samples_per_second': 19.968, 'train_steps_per_second': 2.499, 'train_loss': 0.023890339867186153, 'epoch': 1.0}


[2m[36m(train_distilbert pid=5101)[0m 
[2m[36m(train_distilbert pid=5101)[0m 
[2m[36m(train_distilbert pid=5101)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=5101)[0m 
[2m[36m(train_distilbert pid=5101)[0m 
[2m[36m(train_distilbert pid=5101)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=5101)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=5101)[0m   Num examples = 255
[2m[36m(train_distilbert pid=5101)[0m   Batch size = 8


Trial train_distilbert_9989e1f2 reported pearsonr=0.77 with parameters={'num_train_epochs': 1.0, 'learning_rate': 4.708971275278825e-05, 'adam_epsilon': 1e-07, 'adam_beta1': 0.8169934285080104, 'adam_beta2': 0.984971967370353, 'weight_decay': 0.282887858295152}.
Trial train_distilbert_9989e1f2 completed. Last result: pearsonr=0.7663939233856759
[2m[36m(train_distilbert pid=5101)[0m {'eval_loss': 0.011041136458516121, 'eval_rmse': 0.10507681220769882, 'eval_pearsonr': 0.7663939233856759, 'eval_runtime': 4.504, 'eval_samples_per_second': 56.617, 'eval_steps_per_second': 7.105, 'epoch': 1.0}


[2m[36m(train_distilbert pid=5194)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight']
[2m[36m(train_distilbert pid=5194)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=5194)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=5194)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=5194)[0m 
[2m[36m(train_distilbert pid=5194)[0m 
[2m[36m(train_distilbert pid=5194)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=5194)[0m 
[2m[36m(train_distilbert pid=5194)[0m 
[2m[36m(train_distilbert pid=5194)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=5194)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=5194)[0m   Num examples = 255
[2m[36m(train_distilbert pid=5194)[0m   Batch size = 8


[2m[36m(train_distilbert pid=5194)[0m {'train_runtime': 136.6093, 'train_samples_per_second': 20.108, 'train_steps_per_second': 2.518, 'train_loss': 0.01683729609777761, 'epoch': 1.9}


Trial train_distilbert_cd60ed86 reported pearsonr=0.78 with parameters={'num_train_epochs': 1.89972173657516, 'learning_rate': 5.6685294576036236e-05, 'adam_epsilon': 2.6872538327031245e-08, 'adam_beta1': 0.8, 'adam_beta2': 0.9867449226591515, 'weight_decay': 0.25322119765261825}.
Trial train_distilbert_cd60ed86 completed. Last result: pearsonr=0.7762710271798229
[2m[36m(train_distilbert pid=5194)[0m {'eval_loss': 0.010740252211689949, 'eval_rmse': 0.10363517701625824, 'eval_pearsonr': 0.7762710271798229, 'eval_runtime': 4.6141, 'eval_samples_per_second': 55.265, 'eval_steps_per_second': 6.935, 'epoch': 1.9}


[2m[36m(train_distilbert pid=5303)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
[2m[36m(train_distilbert pid=5303)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=5303)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=5303)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=5303)[0m {'train_runtime': 73.9621, 'train_samples_per_second': 19.877, 'train_steps_per_second': 2.501, 'train_loss': 0.021796138866527662, 'epoch': 1.02}


[2m[36m(train_distilbert pid=5303)[0m 
[2m[36m(train_distilbert pid=5303)[0m 
[2m[36m(train_distilbert pid=5303)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=5303)[0m 
[2m[36m(train_distilbert pid=5303)[0m 
[2m[36m(train_distilbert pid=5303)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=5303)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=5303)[0m   Num examples = 255
[2m[36m(train_distilbert pid=5303)[0m   Batch size = 8


Trial train_distilbert_01d2380e reported pearsonr=0.76 with parameters={'num_train_epochs': 1.0167057677026219, 'learning_rate': 3.606369079694055e-05, 'adam_epsilon': 7.582273616553006e-08, 'adam_beta1': 0.8270778222660508, 'adam_beta2': 0.9879216596692822, 'weight_decay': 0.3}.
Trial train_distilbert_01d2380e completed. Last result: pearsonr=0.7646660660824998
[2m[36m(train_distilbert pid=5303)[0m {'eval_loss': 0.011192994192242622, 'eval_rmse': 0.1057969480752945, 'eval_pearsonr': 0.7646660660824998, 'eval_runtime': 4.5284, 'eval_samples_per_second': 56.311, 'eval_steps_per_second': 7.067, 'epoch': 1.02}


[2m[36m(train_distilbert pid=5400)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=5400)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=5400)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=5400)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=5400)[0m 
[2m[36m(train_distilbert pid=5400)[0m 
[2m[36m(train_distilbert pid=5400)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=5400)[0m 
[2m[36m(train_distilbert pid=5400)[0m 
[2m[36m(train_distilbert pid=5400)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=5400)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=5400)[0m   Num examples = 255
[2m[36m(train_distilbert pid=5400)[0m   Batch size = 8


[2m[36m(train_distilbert pid=5400)[0m {'train_runtime': 133.852, 'train_samples_per_second': 20.101, 'train_steps_per_second': 2.518, 'train_loss': 0.018066085057018066, 'epoch': 1.86}


Trial train_distilbert_5fdd242c reported pearsonr=0.77 with parameters={'num_train_epochs': 1.8606500772858698, 'learning_rate': 7.401611371177743e-05, 'adam_epsilon': 6.165004353784579e-08, 'adam_beta1': 0.8, 'adam_beta2': 0.9837987438089482, 'weight_decay': 0.22894862765895652}.
Trial train_distilbert_5fdd242c completed. Last result: pearsonr=0.7728272061027256
[2m[36m(train_distilbert pid=5400)[0m {'eval_loss': 0.011084098368883133, 'eval_rmse': 0.1052810400724411, 'eval_pearsonr': 0.7728272061027256, 'eval_runtime': 4.6393, 'eval_samples_per_second': 54.965, 'eval_steps_per_second': 6.898, 'epoch': 1.86}


[2m[36m(train_distilbert pid=5509)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight']
[2m[36m(train_distilbert pid=5509)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=5509)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=5509)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=5509)[0m {'train_runtime': 117.5538, 'train_samples_per_second': 20.077, 'train_steps_per_second': 2.518, 'train_loss': 0.01932034782461218, 'epoch': 1.64}


[2m[36m(train_distilbert pid=5509)[0m 
[2m[36m(train_distilbert pid=5509)[0m 
[2m[36m(train_distilbert pid=5509)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=5509)[0m 
[2m[36m(train_distilbert pid=5509)[0m 
[2m[36m(train_distilbert pid=5509)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=5509)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=5509)[0m   Num examples = 255
[2m[36m(train_distilbert pid=5509)[0m   Batch size = 8


Trial train_distilbert_917eab2c reported pearsonr=0.77 with parameters={'num_train_epochs': 1.6321469993584028, 'learning_rate': 4.337405364004413e-05, 'adam_epsilon': 9.749780558381314e-08, 'adam_beta1': 0.8, 'adam_beta2': 0.9817778792237832, 'weight_decay': 0.2772355252353551}.
Trial train_distilbert_917eab2c completed. Last result: pearsonr=0.7686267867776893
[2m[36m(train_distilbert pid=5509)[0m {'eval_loss': 0.011148563586175442, 'eval_rmse': 0.10558675229549408, 'eval_pearsonr': 0.7686267867776893, 'eval_runtime': 4.6131, 'eval_samples_per_second': 55.277, 'eval_steps_per_second': 6.937, 'epoch': 1.64}


[2m[36m(train_distilbert pid=5618)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
[2m[36m(train_distilbert pid=5618)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=5618)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=5618)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=5618)[0m 
[2m[36m(train_distilbert pid=5618)[0m 
[2m[36m(train_distilbert pid=5618)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=5618)[0m 
[2m[36m(train_distilbert pid=5618)[0m 
[2m[36m(train_distilbert pid=5618)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=5618)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=5618)[0m   Num examples = 255
[2m[36m(train_distilbert pid=5618)[0m   Batch size = 8


[2m[36m(train_distilbert pid=5618)[0m {'train_runtime': 83.9589, 'train_samples_per_second': 19.962, 'train_steps_per_second': 2.501, 'train_loss': 0.020289327984764464, 'epoch': 1.16}


Trial train_distilbert_ea528200 reported pearsonr=0.77 with parameters={'num_train_epochs': 1.159046131259325, 'learning_rate': 6.154126752931309e-05, 'adam_epsilon': 4.7944412264183225e-08, 'adam_beta1': 0.8163736788862225, 'adam_beta2': 0.9899551704431464, 'weight_decay': 0.25887353071241515}.
Trial train_distilbert_ea528200 completed. Last result: pearsonr=0.769096967680104
[2m[36m(train_distilbert pid=5618)[0m {'eval_loss': 0.011334525421261787, 'eval_rmse': 0.106463722884655, 'eval_pearsonr': 0.769096967680104, 'eval_runtime': 4.5362, 'eval_samples_per_second': 56.214, 'eval_steps_per_second': 7.054, 'epoch': 1.16}


[2m[36m(train_distilbert pid=5713)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
[2m[36m(train_distilbert pid=5713)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=5713)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=5713)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=5713)[0m {'train_runtime': 153.2092, 'train_samples_per_second': 20.111, 'train_steps_per_second': 2.519, 'train_loss': 0.016399888794656863, 'epoch': 2.13}


[2m[36m(train_distilbert pid=5713)[0m 
[2m[36m(train_distilbert pid=5713)[0m 
[2m[36m(train_distilbert pid=5713)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[2m[36m(train_distilbert pid=5713)[0m 
[2m[36m(train_distilbert pid=5713)[0m 
[2m[36m(train_distilbert pid=5713)[0m The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
[2m[36m(train_distilbert pid=5713)[0m ***** Running Evaluation *****
[2m[36m(train_distilbert pid=5713)[0m   Num examples = 255
[2m[36m(train_distilbert pid=5713)[0m   Batch size = 8


Trial train_distilbert_39980b5a reported pearsonr=0.78 with parameters={'num_train_epochs': 2.130842500279904, 'learning_rate': 7.191707204279362e-05, 'adam_epsilon': 1e-07, 'adam_beta1': 0.8148103444617636, 'adam_beta2': 0.986557241529914, 'weight_decay': 0.23585877049061496}.
Trial train_distilbert_39980b5a completed. Last result: pearsonr=0.780701630425316
[2m[36m(train_distilbert pid=5713)[0m {'eval_loss': 0.011654267087578773, 'eval_rmse': 0.10795493423938751, 'eval_pearsonr': 0.780701630425316, 'eval_runtime': 4.5492, 'eval_samples_per_second': 56.054, 'eval_steps_per_second': 7.034, 'epoch': 2.13}


2022-12-04 03:36:39,158	INFO stopper.py:363 -- Reached timeout of 3600 seconds. Stopping all trials.


Trial name,status,loc,adam_beta1,adam_beta2,adam_epsilon,learning_rate,num_train_epochs,weight_decay,iter,total time (s),pearsonr
train_distilbert_77cd135a,TERMINATED,172.28.0.12:2464,0.868934,0.98729,1.9703e-08,1.56626e-06,1.0,0.226646,1.0,82.8108,0.281236
train_distilbert_7c710ec0,TERMINATED,172.28.0.12:2578,0.802586,0.995234,3.25403e-08,3.41024e-06,1.44427,0.0299459,1.0,112.5,0.534588
train_distilbert_aec07e42,TERMINATED,172.28.0.12:2702,0.845374,0.988751,3.16312e-08,1.48909e-06,1.0,0.175998,1.0,80.4575,0.323514
train_distilbert_f648e20e,TERMINATED,172.28.0.12:2814,0.892494,0.985831,1.22729e-08,1.64743e-06,1.02155,0.277294,1.0,81.9792,0.298422
train_distilbert_2a2cfccc,TERMINATED,172.28.0.12:2924,0.845066,0.987186,6.16685e-08,1.29608e-06,1.0,0.228957,1.0,80.3606,0.470215
train_distilbert_5f35023e,TERMINATED,172.28.0.12:3036,0.845682,0.990318,1.62244e-08,1.71084e-06,1.1163,0.12304,1.0,90.1372,0.338453
train_distilbert_93139d68,TERMINATED,172.28.0.12:3144,0.834426,0.987588,9.18311e-08,1e-06,2.30995,0.2386,1.0,174.53,0.4316
train_distilbert_d064b9d6,TERMINATED,172.28.0.12:3263,0.855705,0.986783,4.1413e-08,1.92702e-06,1.0,0.219314,1.0,81.1941,0.352282
train_distilbert_3cde6616,TERMINATED,172.28.0.12:3357,0.828666,0.986393,4.02525e-08,1.32984e-06,1.0,0.177027,1.0,80.4917,0.138756
train_distilbert_6dd4014a,TERMINATED,172.28.0.12:3450,0.861465,0.987978,9.44785e-08,1.26318e-06,1.58844,0.280886,1.0,122.462,0.422607


[2m[36m(train_distilbert pid=5828)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
[2m[36m(train_distilbert pid=5828)[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_distilbert pid=5828)[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_distilbert pid=5828)[0m Some weights of DistilBertForSequenceClassifi

[2m[36m(train_distilbert pid=5828)[0m {'train_runtime': 73.5021, 'train_samples_per_second': 19.673, 'train_steps_per_second': 2.463, 'train_loss': 0.022954408635092044, 'epoch': 1.0}


2022-12-04 03:38:02,516	INFO tune.py:747 -- Total run time: 3688.79 seconds (3606.01 seconds for the tuning loop).


[2m[36m(train_distilbert pid=5828)[0m {'eval_loss': 0.011115641333162785, 'eval_rmse': 0.10543074458837509, 'eval_pearsonr': 0.7644832697747332, 'eval_runtime': 4.5028, 'eval_samples_per_second': 56.631, 'eval_steps_per_second': 7.107, 'epoch': 1.0}


In [78]:
#Apply the best hyperparameters 
training_args = TrainingArguments(output_dir="test_trainer",
                                  logging_steps=20000,
                                  #per_device_train_batch_size=64,
                                  #per_device_eval_batch_size=20,
                                  learning_rate = 7.191707204279362e-05,
                                  adam_epsilon = 1e-07,
                                  adam_beta1 = 0.8148103444617636,
                                  adam_beta2 = 0.986557241529914,
                                  weight_decay = 0.23585877049061496,
                                  num_train_epochs=3,
                                  )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [79]:
#Train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1446
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 543
  Number of trainable parameters = 66954241


Step,Training Loss


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=543, training_loss=0.0134967598467242, metrics={'train_runtime': 212.8491, 'train_samples_per_second': 20.381, 'train_steps_per_second': 2.551, 'total_flos': 574633327417344.0, 'train_loss': 0.0134967598467242, 'epoch': 3.0})

In [80]:
#Evaluate the model
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 255
  Batch size = 8


{'eval_loss': 0.01217750832438469,
 'eval_rmse': 0.11035174876451492,
 'eval_pearsonr': 0.7761107852886949,
 'eval_runtime': 4.1049,
 'eval_samples_per_second': 62.12,
 'eval_steps_per_second': 7.796,
 'epoch': 3.0}

# Testing the Final Model on the test dataset

In [81]:
trainer.save_model("distilbert_classification")

Saving model checkpoint to distilbert_classification
Configuration saved in distilbert_classification/config.json
Model weights saved in distilbert_classification/pytorch_model.bin


In [82]:
predictions = trainer.predict(tokenized_test_dataset)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1002
  Batch size = 8


In [83]:
type(predictions)

transformers.trainer_utils.PredictionOutput

In [84]:
predictions[0]

array([[0.53839153],
       [0.45939144],
       [0.45762992],
       ...,
       [0.46221837],
       [0.7500934 ],
       [0.52689624]], dtype=float32)

In [85]:
predictions[1]

array([0.734, 0.422, 0.663, ..., 0.424, 0.597, 0.547], dtype=float32)

In [86]:
predictions[2]

{'test_loss': 0.015755170956254005,
 'test_rmse': 0.12551960349082947,
 'test_pearsonr': 0.7619375508507787,
 'test_runtime': 16.4978,
 'test_samples_per_second': 60.735,
 'test_steps_per_second': 7.637}

In [87]:
trainer.eval_dataset=tokenized_test_dataset

In [88]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1002
  Batch size = 8


{'eval_loss': 0.015755170956254005,
 'eval_rmse': 0.12551960349082947,
 'eval_pearsonr': 0.7619375508507787,
 'eval_runtime': 16.2302,
 'eval_samples_per_second': 61.737,
 'eval_steps_per_second': 7.763,
 'epoch': 3.0}