In [1]:
import os
from datetime import datetime
import re
import numpy as np
import pandas as pd
from sklearn.metrics import  f1_score, accuracy_score
from bs4 import BeautifulSoup
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModelForSequenceClassification, AdamW
import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_columns", None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute device: the Apple-silicon GPU (MPS) when available, CPU otherwise.
# `device` is bound in both branches so later cells never hit a NameError
# on machines without MPS.
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS device is available.")
else:
    device = torch.device("cpu")
    print("MPS device is not available, using CPU instead.")

MPS device is available.


# Chargement des données

In [3]:
# Read the cleaned train/test question sets; the paths are relative to the working directory.
train_csv_path = 'stackoverflow_questions_cleaned_train.csv'
test_csv_path = 'stackoverflow_questions_cleaned_test.csv'
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)


In [4]:
df_train

Unnamed: 0,date,title,tags,score,answer_count,sentence_bow,sentence_bow_lem,sentence_dl,sentence_use
0,2024-06-07 12:53:39,JSF Composite Components: Facet Content Not Re...,"['jsf', 'primefaces', 'xhtml', 'facet', 'compo...",2,1,jsf composite components facet content not ren...,jsf composite component facet content not rend...,jsf composite components facet content not ren...,JSF Composite Components: Facet Content Not Re...
1,2024-07-11 16:40:13,bootstrap react Accordion - collapse / expand ...,"['reactjs', 'bootstrap-5', 'accordion']",1,1,bootstrap react accordion collapse expand via ...,bootstrap react accordion collapse expand via ...,bootstrap react accordion collapse expand via ...,bootstrap react Accordion - collapse / expand ...
2,2024-06-13 21:50:46,What is the internal data bus?,"['cpu-architecture', '6502']",1,1,what internal data bus,what internal data bus,what is the internal data bus,What is the internal data bus?
3,2024-06-21 19:53:39,Efficiently Marking Holidays in a Data Column,"['python', 'dataframe', 'python-polars', 'pyth...",5,2,efficiently marking holidays data column,efficiently marking holiday data column,efficiently marking holidays in a data column,Efficiently Marking Holidays in a Data Column
4,2024-06-11 20:44:14,How to visualize CNN architecture using draw_c...,"['python', 'architecture', 'conv-neural-networ...",1,2,how visualize cnn architecture using drawconvet,how visualize cnn architecture using drawconvet,how to visualize cnn architecture using drawco...,How to visualize CNN architecture using draw_c...
...,...,...,...,...,...,...,...,...,...
8031,2024-06-18 14:15:11,How to implement a copy of a subsystem but wit...,['simulink'],1,1,how implement copy subsystem different values ...,how implement copy subsystem different value c...,how to implement a copy of a subsystem but wit...,How to implement a copy of a subsystem but wit...
8032,2024-06-20 13:52:36,What syntax do I need to make the compiler put...,"['c', 'mips', 'reverse-engineering', 'metrower...",3,4,what syntax need make compiler put variable stack,what syntax need make compiler put variable stack,what syntax do i need to make the compiler put...,What syntax do I need to make the compiler put...
8033,2024-06-19 18:26:39,Powershell Problem in passing a variable to $m...,"['powershell', 'outlook', 'windows-11']",2,1,powershell problem passing variable mailhtmlbo...,powershell problem passing variable mailhtmlbo...,powershell problem in passing a variable to ma...,Powershell Problem in passing a variable to $m...
8034,2024-07-11 15:30:35,How to save emails sent with Noticed gem,"['ruby-on-rails', 'rubygems']",1,1,how save emails sent noticed gem,how save email sent noticed gem,how to save emails sent with noticed gem,How to save emails sent with Noticed gem


In [5]:
# # Extract 'sentence_use' column
# df_train = df_train['sentence_use']
# df_test = df_test['sentence_use']

In [6]:
# # On convertit en liste car ça se perd quand on save en csv, puis on passe de liste à texte
# import ast
# df_train['text'] = df_train['sentence_use'].map(lambda X: ast.literal_eval(X)).map(lambda X: ' '.join(X))
# df_test['text'] = df_test['sentence_use'].map(lambda X: ast.literal_eval(X)).map(lambda X: ' '.join(X))

In [7]:
df_train['sentence_use'].iloc[0]

'JSF Composite Components: Facet Content Not Rendering Through Nested Components'

In [8]:
import ast


def _parse_tags(raw):
    """Return the tag list for one row of the ``tags`` column.

    The CSV round-trip stores each list as its string repr, so strings are
    parsed back with ``ast.literal_eval``. Values that are already lists are
    returned unchanged, which makes this cell safe to re-run.
    """
    if isinstance(raw, list):
        return raw
    return ast.literal_eval(raw)


# Lists are lost when saving to CSV, so convert the string reprs back to lists.
df_train['tags'] = df_train['tags'].map(_parse_tags)
df_test['tags'] = df_test['tags'].map(_parse_tags)

In [9]:
df_train

Unnamed: 0,date,title,tags,score,answer_count,sentence_bow,sentence_bow_lem,sentence_dl,sentence_use
0,2024-06-07 12:53:39,JSF Composite Components: Facet Content Not Re...,"[jsf, primefaces, xhtml, facet, composite-comp...",2,1,jsf composite components facet content not ren...,jsf composite component facet content not rend...,jsf composite components facet content not ren...,JSF Composite Components: Facet Content Not Re...
1,2024-07-11 16:40:13,bootstrap react Accordion - collapse / expand ...,"[reactjs, bootstrap-5, accordion]",1,1,bootstrap react accordion collapse expand via ...,bootstrap react accordion collapse expand via ...,bootstrap react accordion collapse expand via ...,bootstrap react Accordion - collapse / expand ...
2,2024-06-13 21:50:46,What is the internal data bus?,"[cpu-architecture, 6502]",1,1,what internal data bus,what internal data bus,what is the internal data bus,What is the internal data bus?
3,2024-06-21 19:53:39,Efficiently Marking Holidays in a Data Column,"[python, dataframe, python-polars, python-holi...",5,2,efficiently marking holidays data column,efficiently marking holiday data column,efficiently marking holidays in a data column,Efficiently Marking Holidays in a Data Column
4,2024-06-11 20:44:14,How to visualize CNN architecture using draw_c...,"[python, architecture, conv-neural-network, vi...",1,2,how visualize cnn architecture using drawconvet,how visualize cnn architecture using drawconvet,how to visualize cnn architecture using drawco...,How to visualize CNN architecture using draw_c...
...,...,...,...,...,...,...,...,...,...
8031,2024-06-18 14:15:11,How to implement a copy of a subsystem but wit...,[simulink],1,1,how implement copy subsystem different values ...,how implement copy subsystem different value c...,how to implement a copy of a subsystem but wit...,How to implement a copy of a subsystem but wit...
8032,2024-06-20 13:52:36,What syntax do I need to make the compiler put...,"[c, mips, reverse-engineering, metrowerks]",3,4,what syntax need make compiler put variable stack,what syntax need make compiler put variable stack,what syntax do i need to make the compiler put...,What syntax do I need to make the compiler put...
8033,2024-06-19 18:26:39,Powershell Problem in passing a variable to $m...,"[powershell, outlook, windows-11]",2,1,powershell problem passing variable mailhtmlbo...,powershell problem passing variable mailhtmlbo...,powershell problem in passing a variable to ma...,Powershell Problem in passing a variable to $m...
8034,2024-07-11 15:30:35,How to save emails sent with Noticed gem,"[ruby-on-rails, rubygems]",1,1,how save emails sent noticed gem,how save email sent noticed gem,how to save emails sent with noticed gem,How to save emails sent with Noticed gem


# Filtrage des tags (top50)

In [10]:
from collections import Counter

# Flatten the per-question tag lists into one sequence, then tally each tag.
all_tags = []
for question_tags in df_train['tags']:
    all_tags.extend(question_tags)

tag_counts = Counter(all_tags)

In [11]:
all_tags

['jsf',
 'primefaces',
 'xhtml',
 'facet',
 'composite-component',
 'reactjs',
 'bootstrap-5',
 'accordion',
 'cpu-architecture',
 '6502',
 'python',
 'dataframe',
 'python-polars',
 'python-holidays',
 'python',
 'architecture',
 'conv-neural-network',
 'visualization',
 'structure',
 'android',
 'android-studio',
 'android-sdk-tools',
 'android-sdk-manager',
 'java',
 'intellij-idea',
 'intellij-plugin',
 'python',
 'matplotlib',
 'matrix',
 'bezier',
 'surface',
 'python',
 'google-colaboratory',
 'altair',
 'clamp',
 'python',
 'pdb',
 'c',
 'memory-alignment',
 '24-bit',
 'python',
 'list',
 'android',
 'kotlin',
 'android-jetpack-compose',
 'android-jetpack-compose-material3',
 'flutter',
 'dart',
 'excel',
 'excel-formula',
 'python',
 'sqlalchemy',
 'android',
 'kotlin',
 'android-jetpack-compose',
 'angular',
 'typescript',
 'powerbi',
 'powerbi-embedded',
 'power-bi-angular',
 'flutter',
 'angular',
 'signals',
 'ngrx',
 'android',
 'android-jetpack-compose',
 'kubernetes',
 

In [12]:
# top 50
tag_counts.most_common(50)

[('python', 1246),
 ('javascript', 524),
 ('c#', 400),
 ('r', 381),
 ('c++', 375),
 ('angular', 352),
 ('java', 301),
 ('typescript', 270),
 ('reactjs', 268),
 ('c', 248),
 ('html', 248),
 ('css', 247),
 ('android', 241),
 ('pandas', 213),
 ('sql', 184),
 ('dataframe', 175),
 ('excel', 173),
 ('php', 162),
 ('kotlin', 156),
 ('flutter', 147),
 ('postgresql', 141),
 ('swift', 135),
 ('node.js', 126),
 ('powershell', 121),
 ('django', 118),
 ('.net', 113),
 ('ios', 107),
 ('android-jetpack-compose', 102),
 ('go', 101),
 ('numpy', 99),
 ('regex', 99),
 ('azure', 95),
 ('json', 93),
 ('spring-boot', 89),
 ('asp.net-core', 87),
 ('algorithm', 85),
 ('vba', 84),
 ('arrays', 83),
 ('python-3.x', 82),
 ('swiftui', 82),
 ('next.js', 81),
 ('rust', 80),
 ('react-native', 75),
 ('excel-formula', 72),
 ('ggplot2', 71),
 ('python-polars', 70),
 ('vue.js', 69),
 ('docker', 69),
 ('azure-devops', 68),
 ('visual-studio-code', 68)]

In [13]:
# Names of the 50 most frequent training tags, most frequent first.
top_50_tags = [name for name, _ in tag_counts.most_common(50)]

In [14]:
def filter_tags(tags, allowed=None):
    """Keep only the tags that belong to an allowed vocabulary.

    Args:
        tags: Iterable of tag strings for one question.
        allowed: Optional collection of tags to keep. When omitted, the
            notebook-level ``top_50_tags`` list is used, which is the
            original behaviour.

    Returns:
        A new list with the retained tags, in their original order.
    """
    if allowed is None:
        allowed = top_50_tags
    # Set membership is O(1); the original scanned a list for every tag.
    allowed = frozenset(allowed)
    return [tag for tag in tags if tag in allowed]

# Keep only the top-50 tags for every question, in both splits.
df_train['tags50'] = df_train['tags'].map(filter_tags)
df_test['tags50'] = df_test['tags'].map(filter_tags)

In [15]:
df_train['tags50']

0                                       []
1                                [reactjs]
2                                       []
3       [python, dataframe, python-polars]
4                                 [python]
                       ...                
8031                                    []
8032                                   [c]
8033                          [powershell]
8034                                    []
8035                     [sql, postgresql]
Name: tags50, Length: 8036, dtype: object

In [16]:
# Keep only the training questions that still have at least one top-50 tag.
has_top_tag = df_train['tags50'].map(len) > 0
df_train = df_train[has_top_tag]

In [17]:
df_train['tags50']

1                                [reactjs]
3       [python, dataframe, python-polars]
4                                 [python]
5                                [android]
6                                   [java]
                       ...                
8029                             [flutter]
8030                          [javascript]
8032                                   [c]
8033                          [powershell]
8035                     [sql, postgresql]
Name: tags50, Length: 6304, dtype: object

In [18]:
df_train['sentence_use'].iloc[3]

'Failed to find Platform SDK with path: platforms;android-35'

# Sélection du device pour le deep learning et chargement du tokenizer

In [19]:
# Choose the best available backend instead of hard-coding 'mps', so the
# notebook also runs on CUDA or CPU-only machines. The value stays a plain
# string, as before.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

In [20]:
# Load the tokenizer of the 'bert-base-uncased' checkpoint (the same checkpoint name is used for the model below).
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Encodage de la target

In [21]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the tag vocabulary from the training tags and encode them as multi-hot rows.
mlb = MultiLabelBinarizer()
training_tag_lists = df_train['tags50']
encoded_tags_train = mlb.fit_transform(training_tag_lists)

In [22]:
# Encode the test tags with the binarizer that was fitted on the training tags.
encoded_tags_test = mlb.transform(df_test['tags50'])

In [23]:
encoded_tags_train.shape

(6304, 50)

In [24]:
mlb.classes_.shape[0]

50

In [25]:
# One `tag_<i>` indicator column per encoded class. The width is read from the
# encoded matrix rather than hard-coded to 50, so the cell still works if the
# number of retained tags changes.
df_train_array = pd.DataFrame(
    encoded_tags_train,
    columns=[f'tag_{i}' for i in range(encoded_tags_train.shape[1])],
)

In [26]:
df_train_array

Unnamed: 0,tag_0,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8,tag_9,tag_10,tag_11,tag_12,tag_13,tag_14,tag_15,tag_16,tag_17,tag_18,tag_19,tag_20,tag_21,tag_22,tag_23,tag_24,tag_25,tag_26,tag_27,tag_28,tag_29,tag_30,tag_31,tag_32,tag_33,tag_34,tag_35,tag_36,tag_37,tag_38,tag_39,tag_40,tag_41,tag_42,tag_43,tag_44,tag_45,tag_46,tag_47,tag_48,tag_49
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6299,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6300,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6301,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6302,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
# Same layout as the training indicators; the column count follows the encoded
# matrix instead of a hard-coded 50.
df_test_array = pd.DataFrame(
    encoded_tags_test,
    columns=[f'tag_{i}' for i in range(encoded_tags_test.shape[1])],
)

In [28]:
df_train.shape

(6304, 10)

In [29]:
# Renumber the rows 0..n-1 (the row filter left gaps) so the label frame aligns on concat.
df_train = df_train.reset_index(drop=True)

In [30]:
# Append the encoded target columns to each frame (column-wise concat, aligned on the index).
df_train = pd.concat((df_train, df_train_array), axis="columns")
df_test = pd.concat((df_test, df_test_array), axis="columns")

In [31]:
df_train.shape

(6304, 60)

In [32]:
# target_cols are the columns produced by the MultiLabelBinarizer encoding of the
# tag column: one `tag_<i>` per class. The count follows `mlb.classes_` instead of
# a hard-coded 50. The former `try/except NameError` was unreachable (the name is
# bound on the line just before it is printed) and has been dropped.
target_cols = [f'tag_{i}' for i in range(len(mlb.classes_))]
print(target_cols)

['tag_0', 'tag_1', 'tag_2', 'tag_3', 'tag_4', 'tag_5', 'tag_6', 'tag_7', 'tag_8', 'tag_9', 'tag_10', 'tag_11', 'tag_12', 'tag_13', 'tag_14', 'tag_15', 'tag_16', 'tag_17', 'tag_18', 'tag_19', 'tag_20', 'tag_21', 'tag_22', 'tag_23', 'tag_24', 'tag_25', 'tag_26', 'tag_27', 'tag_28', 'tag_29', 'tag_30', 'tag_31', 'tag_32', 'tag_33', 'tag_34', 'tag_35', 'tag_36', 'tag_37', 'tag_38', 'tag_39', 'tag_40', 'tag_41', 'tag_42', 'tag_43', 'tag_44', 'tag_45', 'tag_46', 'tag_47', 'tag_48', 'tag_49']


In [33]:
# Actual tag names behind the `tag_<i>` columns, in the binarizer's class order.
target_cols_real = list(mlb.classes_)

print(target_cols_real)

['.net', 'algorithm', 'android', 'android-jetpack-compose', 'angular', 'arrays', 'asp.net-core', 'azure', 'azure-devops', 'c', 'c#', 'c++', 'css', 'dataframe', 'django', 'docker', 'excel', 'excel-formula', 'flutter', 'ggplot2', 'go', 'html', 'ios', 'java', 'javascript', 'json', 'kotlin', 'next.js', 'node.js', 'numpy', 'pandas', 'php', 'postgresql', 'powershell', 'python', 'python-3.x', 'python-polars', 'r', 'react-native', 'reactjs', 'regex', 'rust', 'spring-boot', 'sql', 'swift', 'swiftui', 'typescript', 'vba', 'visual-studio-code', 'vue.js']


# Conversion des données dans un format adapté au modèle Hugging Face

In [34]:
from sklearn.model_selection import train_test_split

# Split the original test frame in half: one half for validation, the other for the final test.
# NOTE(review): `df_test` is rebound here, so re-running this cell halves it again.
df_val, df_test = train_test_split(
    df_test,
    test_size=0.5,
    random_state=42,
    shuffle=True,
)

In [35]:
from datasets import Dataset

# Wrap the frames as Hugging Face datasets; they are used here to pull out the text column.
train_dataset = Dataset.from_pandas(df_train)
val_dataset = Dataset.from_pandas(df_val)

# Tokenize the question text with truncation and padding enabled, capped at 512 tokens.
tokenize_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_dataset['sentence_use'], **tokenize_options)
val_encodings = tokenizer(val_dataset['sentence_use'], **tokenize_options)

In [36]:
class BertProcessedDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with multi-hot label rows.

    Args:
        encodings: Mapping from feature name (for example ``input_ids``) to a
            per-example indexable sequence, such as the tokenizer output.
        labels: Indexable collection with one label row per example.

    Items are returned as CPU tensors. Device placement is left to the
    training loop (the Hugging Face ``Trainer`` moves each batch to the model's
    device). Moving tensors inside ``__getitem__`` depended on a global
    ``device`` and is incompatible with pinned-memory data loading on CUDA.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of example ``idx`` plus a float32 ``labels`` tensor."""
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        # float32 labels, matching the original dtype used for the multi-label loss.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)
        
# Pair each split's encodings with its label matrix (the `target_cols` columns).
train_labels = df_train[target_cols].to_numpy()
val_labels = df_val[target_cols].to_numpy()
train_dataset = BertProcessedDataset(train_encodings, train_labels)
valid_dataset = BertProcessedDataset(val_encodings, val_labels)

# Chargement du modèle sur le device

In [37]:

# BERT with a sequence-classification head configured for multi-label output:
# one label per binarizer class.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(mlb.classes_),
)
model.to(device);

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
device

'mps'

# Préparation du Wrapper

In [39]:
import numpy as np
import torch
from sklearn.metrics import f1_score, accuracy_score, jaccard_score

def compute_metrics(eval_pred):
    """Compute multi-label evaluation metrics from raw logits.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with ``f1_micro``, ``f1_macro``, ``f1_weighted``, ``mean_iou``
        (macro-averaged Jaccard score) and ``accuracy``.
    """
    logits, labels = eval_pred

    # Logits -> probabilities -> hard 0/1 decisions at the 0.5 cut-off.
    probs = torch.sigmoid(torch.Tensor(logits))
    predictions = (probs >= 0.5).numpy().astype(np.float64)

    return {
        # Global F1 variants
        "f1_micro": f1_score(labels, predictions, average="micro"),
        "f1_macro": f1_score(labels, predictions, average="macro"),
        "f1_weighted": f1_score(labels, predictions, average="weighted"),
        # Mean IoU
        "mean_iou": jaccard_score(labels, predictions, average="macro"),
        # Accuracy as computed by sklearn's accuracy_score on the full label rows
        "accuracy": accuracy_score(labels, predictions),
    }

In [40]:
# Timestamp the run so it writes to its own output directory.
run_started_at = datetime.now()
now = run_started_at.strftime("%Y-%m-%d_%H-%M-%S")
os.makedirs(f"./{now}-bert-model", exist_ok=True)

In [41]:
# Defining some key variables that will be used later on in the training
# (all four are passed to TrainingArguments and logged as MLflow parameters).
TRAIN_BATCH_SIZE = 8  # per_device_train_batch_size
VALID_BATCH_SIZE = 8  # per_device_eval_batch_size
EPOCHS = 10  # num_train_epochs
LEARNING_RATE = 1e-4  # learning_rate

In [42]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Evaluate and checkpoint once per epoch; the best checkpoint according to
# weighted F1 (higher is better) is reloaded at the end of training.
args = TrainingArguments(
    output_dir=f"./{now}-bert-model",
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,  # improves training, but only marginally
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    eval_accumulation_steps=50,  # frees memory after every 50 evaluation steps
    load_best_model_at_end=True,
    metric_for_best_model='f1_weighted',
    greater_is_better=True,
)

# Early stopping configured with patience 5 and improvement threshold 0.02.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.02)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Model fit

In [43]:
print(torch.backends.mps.is_available())
print(torch.backends.mps.is_built())

True
True


In [44]:
import mlflow

# Create the experiment if it doesn't exist
experiment_name = "stackoverflow_multilabel_classification"
if not mlflow.get_experiment_by_name(experiment_name):
    mlflow.create_experiment(experiment_name)

# Set the experiment
mlflow.set_experiment(experiment_name)


with mlflow.start_run(run_name="BERT_MultiLabel_Classification") as run:
    description = f"Training BERT for multilabel classification with learning rate {LEARNING_RATE} and {EPOCHS} epochs."
    mlflow.set_tag("mlflow.note.content", description)

    # Log parameters
    mlflow.log_param("model_name", "bert-base-uncased")
    mlflow.log_param("learning_rate", LEARNING_RATE)
    mlflow.log_param("epochs", EPOCHS)
    mlflow.log_param("train_batch_size", TRAIN_BATCH_SIZE)
    mlflow.log_param("valid_batch_size", VALID_BATCH_SIZE)
    mlflow.log_param("num_labels", mlb.classes_.shape[0])
    mlflow.log_param("device", device)

    # Model fit
    trainer.train()

    # Attach the label mapping BEFORE exporting anything, so that both the
    # MLflow artifact and the saved checkpoint carry the real tag names.
    # Previously the model was logged to MLflow before id2label was set,
    # and label2id was never set.
    id2label = {i: str(class_name) for i, class_name in enumerate(mlb.classes_)}
    trainer.model.config.id2label = id2label
    trainer.model.config.label2id = {class_name: i for i, class_name in id2label.items()}

    # Log model (the same object whose config was just updated)
    mlflow.pytorch.log_model(trainer.model, "model")

    # Log metrics from the best model (load_best_model_at_end=True reloads it after training)
    best_metrics = trainer.evaluate()
    for key, value in best_metrics.items():
        mlflow.log_metric(key, value)

    # Save the model
    trainer.save_model(f"./{now}-bert-model")
    # mlflow.log_artifact(f"./{now}-bert-model")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'loss': 0.1382, 'grad_norm': 0.24452868103981018, 'learning_rate': 9.365482233502539e-05, 'epoch': 0.63}


                                                    
 10%|█         | 788/7880 [07:54<1:19:19,  1.49it/s]

{'eval_loss': 0.09047643095254898, 'eval_f1_micro': 0.09547738693467336, 'eval_f1_macro': 0.020998999461248363, 'eval_f1_weighted': 0.06769695851989206, 'eval_mean_iou': 0.015214635551714204, 'eval_accuracy': 0.23184079601990049, 'eval_runtime': 14.4637, 'eval_samples_per_second': 69.484, 'eval_steps_per_second': 8.711, 'epoch': 1.0}


 13%|█▎        | 1000/7880 [09:59<1:04:15,  1.78it/s]

{'loss': 0.1125, 'grad_norm': 0.33326929807662964, 'learning_rate': 8.730964467005075e-05, 'epoch': 1.27}


 19%|█▉        | 1500/7880 [14:42<59:55,  1.77it/s]  

{'loss': 0.0924, 'grad_norm': 0.3720664381980896, 'learning_rate': 8.096446700507615e-05, 'epoch': 1.9}


                                                     
 20%|██        | 1576/7880 [15:38<1:01:15,  1.72it/s]

{'eval_loss': 0.07203800976276398, 'eval_f1_micro': 0.37152103559870553, 'eval_f1_macro': 0.1965833913985432, 'eval_f1_weighted': 0.29486313160547056, 'eval_mean_iou': 0.14193743465196018, 'eval_accuracy': 0.32039800995024875, 'eval_runtime': 13.9766, 'eval_samples_per_second': 71.906, 'eval_steps_per_second': 9.015, 'epoch': 2.0}


 25%|██▌       | 2000/7880 [19:40<53:38,  1.83it/s]  

{'loss': 0.0752, 'grad_norm': 0.2831851541996002, 'learning_rate': 7.461928934010153e-05, 'epoch': 2.54}


                                                     
 30%|███       | 2364/7880 [23:27<51:01,  1.80it/s]

{'eval_loss': 0.06669244915246964, 'eval_f1_micro': 0.46162790697674416, 'eval_f1_macro': 0.31934921481378775, 'eval_f1_weighted': 0.4075152071708326, 'eval_mean_iou': 0.2284465880268102, 'eval_accuracy': 0.3701492537313433, 'eval_runtime': 13.7923, 'eval_samples_per_second': 72.867, 'eval_steps_per_second': 9.136, 'epoch': 3.0}


 32%|███▏      | 2500/7880 [24:48<48:51,  1.83it/s]  

{'loss': 0.0652, 'grad_norm': 0.33184000849723816, 'learning_rate': 6.82741116751269e-05, 'epoch': 3.17}


 38%|███▊      | 3000/7880 [29:29<48:40,  1.67it/s]

{'loss': 0.0563, 'grad_norm': 0.5147448182106018, 'learning_rate': 6.192893401015228e-05, 'epoch': 3.81}


                                                   
 40%|████      | 3152/7880 [31:09<43:20,  1.82it/s]

{'eval_loss': 0.06564856320619583, 'eval_f1_micro': 0.5182625863770978, 'eval_f1_macro': 0.45611793056858985, 'eval_f1_weighted': 0.49062784299046996, 'eval_mean_iou': 0.3279791330297225, 'eval_accuracy': 0.3800995024875622, 'eval_runtime': 14.026, 'eval_samples_per_second': 71.653, 'eval_steps_per_second': 8.983, 'epoch': 4.0}


 44%|████▍     | 3500/7880 [34:29<45:32,  1.60it/s]  

{'loss': 0.0475, 'grad_norm': 0.4855523407459259, 'learning_rate': 5.5583756345177663e-05, 'epoch': 4.44}


                                                   
 50%|█████     | 3940/7880 [38:51<37:01,  1.77it/s]

{'eval_loss': 0.06755435466766357, 'eval_f1_micro': 0.5056234718826406, 'eval_f1_macro': 0.4526108024018764, 'eval_f1_weighted': 0.47689925048173804, 'eval_mean_iou': 0.3212566652302605, 'eval_accuracy': 0.36218905472636814, 'eval_runtime': 13.953, 'eval_samples_per_second': 72.027, 'eval_steps_per_second': 9.03, 'epoch': 5.0}


 51%|█████     | 4000/7880 [39:28<36:40,  1.76it/s]  

{'loss': 0.0419, 'grad_norm': 0.2320244163274765, 'learning_rate': 4.9238578680203045e-05, 'epoch': 5.08}


 57%|█████▋    | 4500/7880 [44:09<30:52,  1.82it/s]

{'loss': 0.0339, 'grad_norm': 0.31718742847442627, 'learning_rate': 4.289340101522843e-05, 'epoch': 5.71}


                                                   
 60%|██████    | 4728/7880 [46:31<29:16,  1.79it/s]

{'eval_loss': 0.06719451397657394, 'eval_f1_micro': 0.540952380952381, 'eval_f1_macro': 0.5242017543691642, 'eval_f1_weighted': 0.5246147642499209, 'eval_mean_iou': 0.38004705485939816, 'eval_accuracy': 0.3970149253731343, 'eval_runtime': 13.8628, 'eval_samples_per_second': 72.496, 'eval_steps_per_second': 9.089, 'epoch': 6.0}


 63%|██████▎   | 5000/7880 [49:07<28:34,  1.68it/s]  

{'loss': 0.0313, 'grad_norm': 0.2359836846590042, 'learning_rate': 3.654822335025381e-05, 'epoch': 6.35}


 70%|██████▉   | 5500/7880 [53:47<21:36,  1.84it/s]

{'loss': 0.0267, 'grad_norm': 0.23694442212581635, 'learning_rate': 3.020304568527919e-05, 'epoch': 6.98}


                                                   
 70%|███████   | 5516/7880 [54:10<21:29,  1.83it/s]

{'eval_loss': 0.06846260279417038, 'eval_f1_micro': 0.5342925659472422, 'eval_f1_macro': 0.5249254828473583, 'eval_f1_weighted': 0.5202118005411928, 'eval_mean_iou': 0.38396652531479236, 'eval_accuracy': 0.38606965174129354, 'eval_runtime': 13.961, 'eval_samples_per_second': 71.986, 'eval_steps_per_second': 9.025, 'epoch': 7.0}


 76%|███████▌  | 6000/7880 [58:45<17:10,  1.82it/s]  

{'loss': 0.0227, 'grad_norm': 0.21346376836299896, 'learning_rate': 2.385786802030457e-05, 'epoch': 7.61}


                                                     
 80%|████████  | 6304/7880 [1:01:49<15:02,  1.75it/s]

{'eval_loss': 0.07001227140426636, 'eval_f1_micro': 0.5389334591788579, 'eval_f1_macro': 0.5218675167658745, 'eval_f1_weighted': 0.5240037242837978, 'eval_mean_iou': 0.3781798852502082, 'eval_accuracy': 0.3880597014925373, 'eval_runtime': 14.0669, 'eval_samples_per_second': 71.444, 'eval_steps_per_second': 8.957, 'epoch': 8.0}


 82%|████████▏ | 6500/7880 [1:03:46<12:50,  1.79it/s]  

{'loss': 0.0203, 'grad_norm': 0.48248565196990967, 'learning_rate': 1.751269035532995e-05, 'epoch': 8.25}


 89%|████████▉ | 7000/7880 [1:08:25<08:07,  1.81it/s]

{'loss': 0.0191, 'grad_norm': 0.15533439815044403, 'learning_rate': 1.116751269035533e-05, 'epoch': 8.88}


                                                     
 90%|█████████ | 7092/7880 [1:09:31<07:08,  1.84it/s]

{'eval_loss': 0.0696287453174591, 'eval_f1_micro': 0.5428296438883542, 'eval_f1_macro': 0.5334343094291728, 'eval_f1_weighted': 0.5291344435211136, 'eval_mean_iou': 0.3897917984399232, 'eval_accuracy': 0.4079601990049751, 'eval_runtime': 14.1471, 'eval_samples_per_second': 71.039, 'eval_steps_per_second': 8.906, 'epoch': 9.0}


 95%|█████████▌| 7500/7880 [1:13:22<03:50,  1.65it/s]  

{'loss': 0.0163, 'grad_norm': 0.18391187489032745, 'learning_rate': 4.822335025380711e-06, 'epoch': 9.52}


                                                     
100%|██████████| 7880/7880 [1:17:12<00:00,  1.84it/s]

{'eval_loss': 0.07155022770166397, 'eval_f1_micro': 0.5448598130841121, 'eval_f1_macro': 0.5432394333058009, 'eval_f1_weighted': 0.5360678993947966, 'eval_mean_iou': 0.39515003173755386, 'eval_accuracy': 0.39800995024875624, 'eval_runtime': 13.4176, 'eval_samples_per_second': 74.901, 'eval_steps_per_second': 9.391, 'epoch': 10.0}


100%|██████████| 7880/7880 [1:17:15<00:00,  1.70it/s]


{'train_runtime': 4635.9277, 'train_samples_per_second': 13.598, 'train_steps_per_second': 1.7, 'train_loss': 0.05149091754467959, 'epoch': 10.0}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 126/126 [00:12<00:00, 10.27it/s]


# Test

In [45]:
# Tokenize the test split with the same settings as train/validation, then pair it with its labels.
test_dataset = Dataset.from_pandas(df_test)
test_texts = test_dataset['sentence_use']
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)
test_labels = df_test[target_cols].to_numpy()
test_dataset = BertProcessedDataset(test_encodings, test_labels)

In [46]:
# Run the trainer's model on the test dataset; the result's `.metrics` are displayed in the next cell.
pred_otps = trainer.predict(test_dataset)

100%|██████████| 126/126 [00:14<00:00,  8.84it/s]


In [47]:
pred_otps.metrics

{'test_loss': 0.06701730936765671,
 'test_f1_micro': 0.5705244122965641,
 'test_f1_macro': 0.5490506973873429,
 'test_f1_weighted': 0.5624224215275293,
 'test_mean_iou': 0.3976694958435059,
 'test_accuracy': 0.3890547263681592,
 'test_runtime': 14.815,
 'test_samples_per_second': 67.837,
 'test_steps_per_second': 8.505}