In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
from tqdm import tqdm
import json
import joblib
import numpy as np
import os

from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, KFold
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, TrainerCallback
import evaluate
import torch

tqdm.pandas()

# Preprocessamento dos dados

## Dataset inicial 

### Interaction data

In [2]:
df_interaction = pd.read_csv("interaction_data_250603.csv")
df_interaction

Unnamed: 0,correctTag,targetWord,eventType,orderingId,sessionId,draggedTag
0,Verbo,Será,interaction,0,xqEk5ZfKaG,Pronome
1,Verbo,Será,interaction,1,xqEk5ZfKaG,Conjunção Subordinativa
2,Pontuação,",",interaction,0,uQY36KP907,Pontuação
3,Pontuação,!,interaction,1,uQY36KP907,Pontuação
4,Substantivo,árvore,interaction,0,X2RK9EgWWZ,Substantivo
...,...,...,...,...,...,...
24810,Pontuação,",",interaction,0,Halvf7QSu2,Pontuação
24811,Pontuação,!,interaction,1,Halvf7QSu2,Pontuação
24812,Verbo de Ligação,fosse,interaction,2,Halvf7QSu2,Verbo
24813,Verbo,chegar,interaction,3,Halvf7QSu2,Verbo de Ligação


In [3]:
# Cria coluna indicando se houve erro de etiquetagem por parte do aluno
def check_for_mistakes(row):
    return row['correctTag'] != row['draggedTag']

df_interaction['isMistake'] = df_interaction.apply(check_for_mistakes, axis=1)

In [4]:
df_interaction

Unnamed: 0,correctTag,targetWord,eventType,orderingId,sessionId,draggedTag,isMistake
0,Verbo,Será,interaction,0,xqEk5ZfKaG,Pronome,True
1,Verbo,Será,interaction,1,xqEk5ZfKaG,Conjunção Subordinativa,True
2,Pontuação,",",interaction,0,uQY36KP907,Pontuação,False
3,Pontuação,!,interaction,1,uQY36KP907,Pontuação,False
4,Substantivo,árvore,interaction,0,X2RK9EgWWZ,Substantivo,False
...,...,...,...,...,...,...,...
24810,Pontuação,",",interaction,0,Halvf7QSu2,Pontuação,False
24811,Pontuação,!,interaction,1,Halvf7QSu2,Pontuação,False
24812,Verbo de Ligação,fosse,interaction,2,Halvf7QSu2,Verbo,True
24813,Verbo,chegar,interaction,3,Halvf7QSu2,Verbo de Ligação,True


In [5]:
df_interaction.isMistake.value_counts()

True     13599
False    11216
Name: isMistake, dtype: int64

In [6]:
df_interaction.isMistake.value_counts(True)

True     0.548015
False    0.451985
Name: isMistake, dtype: float64

In [7]:
df_interaction.to_pickle("interaction_data_250603.pkl")

### Session data

In [9]:
df_session = pd.read_csv("new_session_data_250603.csv")
df_session

Unnamed: 0,startTime,serie,words,eventType,sessionId
0,2025-05-27 00:49:01.547000+00:00,5-ensino-fundamental,"[{'tag': 'Verbo', 'word': 'Será'}, {'tag': 'Co...",new-session,xqEk5ZfKaG
1,2025-05-27 01:01:42.509000+00:00,5-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,uQY36KP907
2,2025-05-27 11:11:54.325000+00:00,6-ensino-fundamental,"[{'tag': 'Artigo', 'word': 'A'}, {'tag': 'Subs...",new-session,X2RK9EgWWZ
3,2025-05-27 11:12:37.255000+00:00,2-ensino-medio,"[{'tag': 'Pronome', 'word': 'Você'}, {'tag': '...",new-session,IONQQWNtUo
4,2025-05-27 12:21:30.712000+00:00,1-ensino-medio,"[{'tag': 'Advérbio', 'word': 'Depois'}, {'tag'...",new-session,brmbB7P32D
...,...,...,...,...,...
1927,2025-06-03 16:46:46.587000+00:00,7-ensino-fundamental,"[{'tag': 'Artigo', 'word': 'O'}, {'tag': 'Subs...",new-session,asyXKyUXZY
1928,2025-06-03 16:47:40.076000+00:00,7-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Maria'}, {'t...",new-session,wb93AUnw5X
1929,2025-06-03 16:48:19.059000+00:00,9-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,Halvf7QSu2
1930,2025-06-03 16:49:24.637000+00:00,4-ensino-fundamental,"[{'tag': 'Interjeição', 'word': 'Meu'}, {'tag'...",new-session,-K94pJeV9q


In [10]:
# Criando coluna com as frases originais em si
df_session['words'] = df_session['words'].progress_apply(lambda x: json.loads(x.replace("'", '"')))
df_session['sentence'] = df_session['words'].progress_apply(lambda x: ' '.join([token_info['word'] for token_info in x]))

100%|██████████| 1932/1932 [00:00<00:00, 14533.78it/s]
100%|██████████| 1932/1932 [00:00<00:00, 191361.53it/s]


In [11]:
df_session

Unnamed: 0,startTime,serie,words,eventType,sessionId,sentence
0,2025-05-27 00:49:01.547000+00:00,5-ensino-fundamental,"[{'tag': 'Verbo', 'word': 'Será'}, {'tag': 'Co...",new-session,xqEk5ZfKaG,Será que alguém explicou o que aconteceu na au...
1,2025-05-27 01:01:42.509000+00:00,5-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,uQY36KP907,"Ufa , achei que não fosse chegar a tempo !"
2,2025-05-27 11:11:54.325000+00:00,6-ensino-fundamental,"[{'tag': 'Artigo', 'word': 'A'}, {'tag': 'Subs...",new-session,X2RK9EgWWZ,A folha caiu da árvore .
3,2025-05-27 11:12:37.255000+00:00,2-ensino-medio,"[{'tag': 'Pronome', 'word': 'Você'}, {'tag': '...",new-session,IONQQWNtUo,Você já terminou o dever de casa ?
4,2025-05-27 12:21:30.712000+00:00,1-ensino-medio,"[{'tag': 'Advérbio', 'word': 'Depois'}, {'tag'...",new-session,brmbB7P32D,"Depois que o sol se pôs , o frio começou a aum..."
...,...,...,...,...,...,...
1927,2025-06-03 16:46:46.587000+00:00,7-ensino-fundamental,"[{'tag': 'Artigo', 'word': 'O'}, {'tag': 'Subs...",new-session,asyXKyUXZY,O diretor aprovou a ideia dela .
1928,2025-06-03 16:47:40.076000+00:00,7-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Maria'}, {'t...",new-session,wb93AUnw5X,Maria foi à escola com João .
1929,2025-06-03 16:48:19.059000+00:00,9-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,Halvf7QSu2,"Ufa , achei que não fosse chegar a tempo !"
1930,2025-06-03 16:49:24.637000+00:00,4-ensino-fundamental,"[{'tag': 'Interjeição', 'word': 'Meu'}, {'tag'...",new-session,-K94pJeV9q,"Meu caro João , espero que esteja se sentindo ..."


In [12]:
df_session.to_pickle('session_dat250603.pkl')

### Juntando os dataframes

In [13]:
df_session = pd.read_pickle("session_data_250603.pkl")
df_interaction = pd.read_pickle("interaction_data_250603.pkl")

FileNotFoundError: [Errno 2] No such file or directory: 'session_data_250603.pkl'

In [14]:
df_session

Unnamed: 0,startTime,serie,words,eventType,sessionId,sentence
0,2025-05-27 00:49:01.547000+00:00,5-ensino-fundamental,"[{'tag': 'Verbo', 'word': 'Será'}, {'tag': 'Co...",new-session,xqEk5ZfKaG,Será que alguém explicou o que aconteceu na au...
1,2025-05-27 01:01:42.509000+00:00,5-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,uQY36KP907,"Ufa , achei que não fosse chegar a tempo !"
2,2025-05-27 11:11:54.325000+00:00,6-ensino-fundamental,"[{'tag': 'Artigo', 'word': 'A'}, {'tag': 'Subs...",new-session,X2RK9EgWWZ,A folha caiu da árvore .
3,2025-05-27 11:12:37.255000+00:00,2-ensino-medio,"[{'tag': 'Pronome', 'word': 'Você'}, {'tag': '...",new-session,IONQQWNtUo,Você já terminou o dever de casa ?
4,2025-05-27 12:21:30.712000+00:00,1-ensino-medio,"[{'tag': 'Advérbio', 'word': 'Depois'}, {'tag'...",new-session,brmbB7P32D,"Depois que o sol se pôs , o frio começou a aum..."
...,...,...,...,...,...,...
1927,2025-06-03 16:46:46.587000+00:00,7-ensino-fundamental,"[{'tag': 'Artigo', 'word': 'O'}, {'tag': 'Subs...",new-session,asyXKyUXZY,O diretor aprovou a ideia dela .
1928,2025-06-03 16:47:40.076000+00:00,7-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Maria'}, {'t...",new-session,wb93AUnw5X,Maria foi à escola com João .
1929,2025-06-03 16:48:19.059000+00:00,9-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,Halvf7QSu2,"Ufa , achei que não fosse chegar a tempo !"
1930,2025-06-03 16:49:24.637000+00:00,4-ensino-fundamental,"[{'tag': 'Interjeição', 'word': 'Meu'}, {'tag'...",new-session,-K94pJeV9q,"Meu caro João , espero que esteja se sentindo ..."


In [15]:
df_interaction

Unnamed: 0,correctTag,targetWord,eventType,orderingId,sessionId,draggedTag,isMistake
0,Verbo,Será,interaction,0,xqEk5ZfKaG,Pronome,True
1,Verbo,Será,interaction,1,xqEk5ZfKaG,Conjunção Subordinativa,True
2,Pontuação,",",interaction,0,uQY36KP907,Pontuação,False
3,Pontuação,!,interaction,1,uQY36KP907,Pontuação,False
4,Substantivo,árvore,interaction,0,X2RK9EgWWZ,Substantivo,False
...,...,...,...,...,...,...,...
24810,Pontuação,",",interaction,0,Halvf7QSu2,Pontuação,False
24811,Pontuação,!,interaction,1,Halvf7QSu2,Pontuação,False
24812,Verbo de Ligação,fosse,interaction,2,Halvf7QSu2,Verbo,True
24813,Verbo,chegar,interaction,3,Halvf7QSu2,Verbo de Ligação,True


In [16]:
# Join dos datasets em sessionId, mantendo as chaves da tabela de interações (maior)
df = pd.merge(df_interaction, df_session, how="left", left_on="sessionId", right_on="sessionId")
df

Unnamed: 0,correctTag,targetWord,eventType_x,orderingId,sessionId,draggedTag,isMistake,startTime,serie,words,eventType_y,sentence
0,Verbo,Será,interaction,0,xqEk5ZfKaG,Pronome,True,2025-05-27 00:49:01.547000+00:00,5-ensino-fundamental,"[{'tag': 'Verbo', 'word': 'Será'}, {'tag': 'Co...",new-session,Será que alguém explicou o que aconteceu na au...
1,Verbo,Será,interaction,1,xqEk5ZfKaG,Conjunção Subordinativa,True,2025-05-27 00:49:01.547000+00:00,5-ensino-fundamental,"[{'tag': 'Verbo', 'word': 'Será'}, {'tag': 'Co...",new-session,Será que alguém explicou o que aconteceu na au...
2,Pontuação,",",interaction,0,uQY36KP907,Pontuação,False,2025-05-27 01:01:42.509000+00:00,5-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !"
3,Pontuação,!,interaction,1,uQY36KP907,Pontuação,False,2025-05-27 01:01:42.509000+00:00,5-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !"
4,Substantivo,árvore,interaction,0,X2RK9EgWWZ,Substantivo,False,2025-05-27 11:11:54.325000+00:00,6-ensino-fundamental,"[{'tag': 'Artigo', 'word': 'A'}, {'tag': 'Subs...",new-session,A folha caiu da árvore .
...,...,...,...,...,...,...,...,...,...,...,...,...
24810,Pontuação,",",interaction,0,Halvf7QSu2,Pontuação,False,2025-06-03 16:48:19.059000+00:00,9-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !"
24811,Pontuação,!,interaction,1,Halvf7QSu2,Pontuação,False,2025-06-03 16:48:19.059000+00:00,9-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !"
24812,Verbo de Ligação,fosse,interaction,2,Halvf7QSu2,Verbo,True,2025-06-03 16:48:19.059000+00:00,9-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !"
24813,Verbo,chegar,interaction,3,Halvf7QSu2,Verbo de Ligação,True,2025-06-03 16:48:19.059000+00:00,9-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !"


In [17]:
df_unique_sessions = df.groupby('sessionId').last()
df_unique_sessions

Unnamed: 0_level_0,correctTag,targetWord,eventType_x,orderingId,draggedTag,isMistake,startTime,serie,words,eventType_y,sentence
sessionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
-3Q1xJXqzA,Adjetivo,econômica,interaction,38,Adjetivo,False,2025-05-30 20:06:07.020000+00:00,5-ensino-fundamental,"[{'tag': 'Conjunção Subordinativa', 'word': 'E...",new-session,"Embora o projeto tenha sido aprovado , muitos ..."
-6gZ-mHdzv,Substantivo,ideia,interaction,7,Substantivo,False,2025-05-29 01:28:34.290000+00:00,9-ensino-fundamental,"[{'tag': 'Artigo', 'word': 'O'}, {'tag': 'Subs...",new-session,O diretor aprovou a ideia dela .
-CS8WhEL5b,Conjunção Subordinativa,Embora,interaction,14,Conjunção Subordinativa,False,2025-05-29 21:48:03.660000+00:00,3-ensino-medio,"[{'tag': 'Conjunção Subordinativa', 'word': 'E...",new-session,"Embora a chuva tenha parado , o chão ainda est..."
-D4ip2o6o3,Advérbio,já,interaction,38,Advérbio,False,2025-06-02 14:14:54.043000+00:00,8-ensino-fundamental,"[{'tag': 'Pronome', 'word': 'Ela'}, {'tag': 'V...",new-session,Ela perguntou se eu já havia terminado a lição...
-FzZZhJbxP,Verbo,abertos,interaction,18,Verbo,False,2025-05-28 22:17:08.503000+00:00,3-ensino-medio,"[{'tag': 'Pronome', 'word': 'Ela'}, {'tag': 'V...",new-session,Ela estava tão cansada que mal conseguia mante...
...,...,...,...,...,...,...,...,...,...,...,...
zuZgjJCnxQ,Adjetivo,escuro,interaction,14,Adjetivo,False,2025-05-30 20:12:49.957000+00:00,5-ensino-fundamental,"[{'tag': 'Artigo', 'word': 'A'}, {'tag': 'Subs...",new-session,A lua estava linda no céu escuro .
zyUGx8O6BL,Pontuação,.,interaction,11,Pontuação,False,2025-05-30 00:13:21.324000+00:00,8-ensino-fundamental,"[{'tag': 'Conjunção Subordinativa', 'word': 'E...",new-session,"Enquanto ele falava , todos prestavam muita at..."
zym_w_hcsK,Verbo,traga,interaction,28,Verbo,False,2025-05-30 19:45:23.592000+00:00,5-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'João'}, {'ta...",new-session,"João , por favor , traga o seu caderno para a ..."
zz_iWJzUKL,Pontuação,.,interaction,8,Pontuação,False,2025-05-29 16:59:24.770000+00:00,6-ensino-fundamental,"[{'tag': 'Artigo', 'word': 'A'}, {'tag': 'Subs...",new-session,A moça que canta chegou agora .


In [18]:
def check_for_complete_tries(row):
    return len(row['words']) == row['orderingId']

df_unique_sessions['complete_try'] = df_unique_sessions.apply(check_for_complete_tries, axis=1)

In [19]:
df_unique_sessions

Unnamed: 0_level_0,correctTag,targetWord,eventType_x,orderingId,draggedTag,isMistake,startTime,serie,words,eventType_y,sentence,complete_try
sessionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-3Q1xJXqzA,Adjetivo,econômica,interaction,38,Adjetivo,False,2025-05-30 20:06:07.020000+00:00,5-ensino-fundamental,"[{'tag': 'Conjunção Subordinativa', 'word': 'E...",new-session,"Embora o projeto tenha sido aprovado , muitos ...",False
-6gZ-mHdzv,Substantivo,ideia,interaction,7,Substantivo,False,2025-05-29 01:28:34.290000+00:00,9-ensino-fundamental,"[{'tag': 'Artigo', 'word': 'O'}, {'tag': 'Subs...",new-session,O diretor aprovou a ideia dela .,True
-CS8WhEL5b,Conjunção Subordinativa,Embora,interaction,14,Conjunção Subordinativa,False,2025-05-29 21:48:03.660000+00:00,3-ensino-medio,"[{'tag': 'Conjunção Subordinativa', 'word': 'E...",new-session,"Embora a chuva tenha parado , o chão ainda est...",False
-D4ip2o6o3,Advérbio,já,interaction,38,Advérbio,False,2025-06-02 14:14:54.043000+00:00,8-ensino-fundamental,"[{'tag': 'Pronome', 'word': 'Ela'}, {'tag': 'V...",new-session,Ela perguntou se eu já havia terminado a lição...,False
-FzZZhJbxP,Verbo,abertos,interaction,18,Verbo,False,2025-05-28 22:17:08.503000+00:00,3-ensino-medio,"[{'tag': 'Pronome', 'word': 'Ela'}, {'tag': 'V...",new-session,Ela estava tão cansada que mal conseguia mante...,False
...,...,...,...,...,...,...,...,...,...,...,...,...
zuZgjJCnxQ,Adjetivo,escuro,interaction,14,Adjetivo,False,2025-05-30 20:12:49.957000+00:00,5-ensino-fundamental,"[{'tag': 'Artigo', 'word': 'A'}, {'tag': 'Subs...",new-session,A lua estava linda no céu escuro .,False
zyUGx8O6BL,Pontuação,.,interaction,11,Pontuação,False,2025-05-30 00:13:21.324000+00:00,8-ensino-fundamental,"[{'tag': 'Conjunção Subordinativa', 'word': 'E...",new-session,"Enquanto ele falava , todos prestavam muita at...",False
zym_w_hcsK,Verbo,traga,interaction,28,Verbo,False,2025-05-30 19:45:23.592000+00:00,5-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'João'}, {'ta...",new-session,"João , por favor , traga o seu caderno para a ...",False
zz_iWJzUKL,Pontuação,.,interaction,8,Pontuação,False,2025-05-29 16:59:24.770000+00:00,6-ensino-fundamental,"[{'tag': 'Artigo', 'word': 'A'}, {'tag': 'Subs...",new-session,A moça que canta chegou agora .,False


In [20]:
df_unique_sessions.complete_try.value_counts()

False    1337
True       94
Name: complete_try, dtype: int64

In [21]:
complete_tries = []
for index, row in tqdm(df.iterrows()):
    if df_unique_sessions.loc[row['sessionId']]['complete_try']:
        complete_tries.append('Sim')
    else:
        complete_tries.append('Não')

df['complete_try'] = complete_tries

24815it [00:02, 10702.37it/s]


In [22]:
df

Unnamed: 0,correctTag,targetWord,eventType_x,orderingId,sessionId,draggedTag,isMistake,startTime,serie,words,eventType_y,sentence,complete_try
0,Verbo,Será,interaction,0,xqEk5ZfKaG,Pronome,True,2025-05-27 00:49:01.547000+00:00,5-ensino-fundamental,"[{'tag': 'Verbo', 'word': 'Será'}, {'tag': 'Co...",new-session,Será que alguém explicou o que aconteceu na au...,Não
1,Verbo,Será,interaction,1,xqEk5ZfKaG,Conjunção Subordinativa,True,2025-05-27 00:49:01.547000+00:00,5-ensino-fundamental,"[{'tag': 'Verbo', 'word': 'Será'}, {'tag': 'Co...",new-session,Será que alguém explicou o que aconteceu na au...,Não
2,Pontuação,",",interaction,0,uQY36KP907,Pontuação,False,2025-05-27 01:01:42.509000+00:00,5-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !",Não
3,Pontuação,!,interaction,1,uQY36KP907,Pontuação,False,2025-05-27 01:01:42.509000+00:00,5-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !",Não
4,Substantivo,árvore,interaction,0,X2RK9EgWWZ,Substantivo,False,2025-05-27 11:11:54.325000+00:00,6-ensino-fundamental,"[{'tag': 'Artigo', 'word': 'A'}, {'tag': 'Subs...",new-session,A folha caiu da árvore .,Não
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24810,Pontuação,",",interaction,0,Halvf7QSu2,Pontuação,False,2025-06-03 16:48:19.059000+00:00,9-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !",Não
24811,Pontuação,!,interaction,1,Halvf7QSu2,Pontuação,False,2025-06-03 16:48:19.059000+00:00,9-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !",Não
24812,Verbo de Ligação,fosse,interaction,2,Halvf7QSu2,Verbo,True,2025-06-03 16:48:19.059000+00:00,9-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !",Não
24813,Verbo,chegar,interaction,3,Halvf7QSu2,Verbo de Ligação,True,2025-06-03 16:48:19.059000+00:00,9-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !",Não


In [23]:
df.to_pickle("merged_data_250603.pkl")

## Dataset maior (03-06-2025)

In [17]:
df = pd.read_csv("final_df_250603.csv")
df

Unnamed: 0,correctTag,targetWord,orderingId,sessionId,draggedTag,serie
0,Pontuação,.,0,Jh1538yCDy,Pontuação,6-ensino-fundamental
1,Artigo,O,1,Jh1538yCDy,Substantivo,6-ensino-fundamental
2,Substantivo,banco,2,Jh1538yCDy,Artigo,6-ensino-fundamental
3,Substantivo,banco,3,Jh1538yCDy,Substantivo,6-ensino-fundamental
4,Artigo,O,4,Jh1538yCDy,Artigo,6-ensino-fundamental
...,...,...,...,...,...,...
21941,Preposição+Artigo,à,3,wb93AUnw5X,Preposição,7-ensino-fundamental
21942,Preposição+Artigo,à,4,wb93AUnw5X,Preposição+Artigo,7-ensino-fundamental
21943,Verbo,foi,5,wb93AUnw5X,Verbo,7-ensino-fundamental
21944,Substantivo,escola,6,wb93AUnw5X,Substantivo,7-ensino-fundamental


In [18]:
# Cria coluna indicando se houve erro de etiquetagem por parte do aluno
def check_for_mistakes(row):
    return row['correctTag'] != row['draggedTag']

df['isMistake'] = df.apply(check_for_mistakes, axis=1)
df

Unnamed: 0,correctTag,targetWord,orderingId,sessionId,draggedTag,serie,isMistake
0,Pontuação,.,0,Jh1538yCDy,Pontuação,6-ensino-fundamental,False
1,Artigo,O,1,Jh1538yCDy,Substantivo,6-ensino-fundamental,True
2,Substantivo,banco,2,Jh1538yCDy,Artigo,6-ensino-fundamental,True
3,Substantivo,banco,3,Jh1538yCDy,Substantivo,6-ensino-fundamental,False
4,Artigo,O,4,Jh1538yCDy,Artigo,6-ensino-fundamental,False
...,...,...,...,...,...,...,...
21941,Preposição+Artigo,à,3,wb93AUnw5X,Preposição,7-ensino-fundamental,True
21942,Preposição+Artigo,à,4,wb93AUnw5X,Preposição+Artigo,7-ensino-fundamental,False
21943,Verbo,foi,5,wb93AUnw5X,Verbo,7-ensino-fundamental,False
21944,Substantivo,escola,6,wb93AUnw5X,Substantivo,7-ensino-fundamental,False


In [19]:
df.isMistake.value_counts()

True     11830
False    10116
Name: isMistake, dtype: int64

In [20]:
df.isMistake.value_counts(True)

True     0.53905
False    0.46095
Name: isMistake, dtype: float64

In [21]:
# # Criando coluna com as frases originais em si - CAMPO FALTANTE: IGNORANDO
# df['words'] = df['words'].progress_apply(lambda x: json.loads(x.replace("'", '"')))
# df['sentence'] = df['words'].progress_apply(lambda x: ' '.join([token_info['word'] for token_info in x]))

In [22]:
# Impossível criar a coluna "complete_try" também devido a campos faltantes

In [23]:
df.serie.value_counts()

5-ensino-fundamental    6305
7-ensino-fundamental    4690
6-ensino-fundamental    4318
8-ensino-fundamental    3593
9-ensino-fundamental    1672
4-ensino-fundamental     594
3-ensino-medio           508
1-ensino-medio           199
2-ensino-medio            67
Name: serie, dtype: int64

In [24]:
df.to_pickle("final_df_250603.pkl")

# Montando dataset
features importantes:

- targetWord
- correctTag
- draggedTag
- isMistake
- sentence
- orderingId

Target:
- serie


## Dataset inicial

In [24]:
df = pd.read_pickle('merged_data_250603.pkl')
df

Unnamed: 0,correctTag,targetWord,eventType_x,orderingId,sessionId,draggedTag,isMistake,startTime,serie,words,eventType_y,sentence,complete_try
0,Verbo,Será,interaction,0,xqEk5ZfKaG,Pronome,True,2025-05-27 00:49:01.547000+00:00,5-ensino-fundamental,"[{'tag': 'Verbo', 'word': 'Será'}, {'tag': 'Co...",new-session,Será que alguém explicou o que aconteceu na au...,Não
1,Verbo,Será,interaction,1,xqEk5ZfKaG,Conjunção Subordinativa,True,2025-05-27 00:49:01.547000+00:00,5-ensino-fundamental,"[{'tag': 'Verbo', 'word': 'Será'}, {'tag': 'Co...",new-session,Será que alguém explicou o que aconteceu na au...,Não
2,Pontuação,",",interaction,0,uQY36KP907,Pontuação,False,2025-05-27 01:01:42.509000+00:00,5-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !",Não
3,Pontuação,!,interaction,1,uQY36KP907,Pontuação,False,2025-05-27 01:01:42.509000+00:00,5-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !",Não
4,Substantivo,árvore,interaction,0,X2RK9EgWWZ,Substantivo,False,2025-05-27 11:11:54.325000+00:00,6-ensino-fundamental,"[{'tag': 'Artigo', 'word': 'A'}, {'tag': 'Subs...",new-session,A folha caiu da árvore .,Não
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24810,Pontuação,",",interaction,0,Halvf7QSu2,Pontuação,False,2025-06-03 16:48:19.059000+00:00,9-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !",Não
24811,Pontuação,!,interaction,1,Halvf7QSu2,Pontuação,False,2025-06-03 16:48:19.059000+00:00,9-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !",Não
24812,Verbo de Ligação,fosse,interaction,2,Halvf7QSu2,Verbo,True,2025-06-03 16:48:19.059000+00:00,9-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !",Não
24813,Verbo,chegar,interaction,3,Halvf7QSu2,Verbo de Ligação,True,2025-06-03 16:48:19.059000+00:00,9-ensino-fundamental,"[{'tag': 'Nome Próprio', 'word': 'Ufa'}, {'tag...",new-session,"Ufa , achei que não fosse chegar a tempo !",Não


In [25]:
model_inputs = []
targets = []
for sessionId in tqdm(df.sessionId.unique()):
    df_filtered = df[df.sessionId == sessionId].reset_index(drop=True)
    targets.append(df_filtered.loc[0, 'serie'])

    model_input = f"Frase original: \"{df_filtered.loc[0, 'sentence']}\"\nTentativa completa: {df_filtered.loc[0, 'complete_try']}. Interações do aluno:\n"
    for index, row in df_filtered.iterrows():
        model_input += (
            f"Palavra: {row['targetWord']}. "
            f"Tag correta: {row['correctTag']}. "
            f"Tag escolhida: {row['draggedTag']}. "
            f"Erro: {'sim' if row['isMistake'] else 'não'}. "
            f"Ordem da interação: {row['orderingId'] + 1}.\n"
        )

    model_input += "Fim."
    model_inputs.append(model_input)

dataset = pd.DataFrame()
dataset['input'] = model_inputs
dataset['target'] = targets

100%|██████████| 1431/1431 [00:03<00:00, 433.79it/s]


In [26]:
dataset

Unnamed: 0,input,target
0,"Frase original: ""Será que alguém explicou o qu...",5-ensino-fundamental
1,"Frase original: ""Ufa , achei que não fosse che...",5-ensino-fundamental
2,"Frase original: ""A folha caiu da árvore .""\nTe...",6-ensino-fundamental
3,"Frase original: ""É importante que todos os alu...",6-ensino-fundamental
4,"Frase original: ""Não acredito que você tenha f...",9-ensino-fundamental
...,...,...
1426,"Frase original: ""Eu acho que ele não vai gosta...",7-ensino-fundamental
1427,"Frase original: ""É importante que todos os alu...",7-ensino-fundamental
1428,"Frase original: ""O diretor aprovou a ideia del...",7-ensino-fundamental
1429,"Frase original: ""Maria foi à escola com João ....",7-ensino-fundamental


In [27]:
dataset.target.value_counts()

5-ensino-fundamental    310
7-ensino-fundamental    308
8-ensino-fundamental    278
6-ensino-fundamental    253
9-ensino-fundamental    177
3-ensino-medio           44
4-ensino-fundamental     40
1-ensino-medio           15
2-ensino-medio            6
Name: target, dtype: int64

In [28]:
# Filtrando ensino médio (por enquanto)
dataset = dataset[(dataset.target != '1-ensino-medio')]
dataset = dataset[(dataset.target != '2-ensino-medio')]
# dataset = dataset[(dataset.target != '3-ensino-medio')]
dataset = dataset.reset_index(drop=True)
dataset

Unnamed: 0,input,target
0,"Frase original: ""Será que alguém explicou o qu...",5-ensino-fundamental
1,"Frase original: ""Ufa , achei que não fosse che...",5-ensino-fundamental
2,"Frase original: ""A folha caiu da árvore .""\nTe...",6-ensino-fundamental
3,"Frase original: ""É importante que todos os alu...",6-ensino-fundamental
4,"Frase original: ""Não acredito que você tenha f...",9-ensino-fundamental
...,...,...
1405,"Frase original: ""Eu acho que ele não vai gosta...",7-ensino-fundamental
1406,"Frase original: ""É importante que todos os alu...",7-ensino-fundamental
1407,"Frase original: ""O diretor aprovou a ideia del...",7-ensino-fundamental
1408,"Frase original: ""Maria foi à escola com João ....",7-ensino-fundamental


In [29]:
le = preprocessing.LabelEncoder()
le.fit(dataset.target)
dataset['label'] = le.transform(dataset.target)
dataset

Unnamed: 0,input,target,label
0,"Frase original: ""Será que alguém explicou o qu...",5-ensino-fundamental,2
1,"Frase original: ""Ufa , achei que não fosse che...",5-ensino-fundamental,2
2,"Frase original: ""A folha caiu da árvore .""\nTe...",6-ensino-fundamental,3
3,"Frase original: ""É importante que todos os alu...",6-ensino-fundamental,3
4,"Frase original: ""Não acredito que você tenha f...",9-ensino-fundamental,6
...,...,...,...
1405,"Frase original: ""Eu acho que ele não vai gosta...",7-ensino-fundamental,4
1406,"Frase original: ""É importante que todos os alu...",7-ensino-fundamental,4
1407,"Frase original: ""O diretor aprovou a ideia del...",7-ensino-fundamental,4
1408,"Frase original: ""Maria foi à escola com João ....",7-ensino-fundamental,4


In [30]:
dataset.to_pickle('dataset_250603.pkl')
joblib.dump(le, 'label_encoder_250603.joblib')

['label_encoder_250603.joblib']

## Dataset maior (03-06-2025)

In [25]:
df = pd.read_pickle('final_df_250603.pkl')
df

Unnamed: 0,correctTag,targetWord,orderingId,sessionId,draggedTag,serie,isMistake
0,Pontuação,.,0,Jh1538yCDy,Pontuação,6-ensino-fundamental,False
1,Artigo,O,1,Jh1538yCDy,Substantivo,6-ensino-fundamental,True
2,Substantivo,banco,2,Jh1538yCDy,Artigo,6-ensino-fundamental,True
3,Substantivo,banco,3,Jh1538yCDy,Substantivo,6-ensino-fundamental,False
4,Artigo,O,4,Jh1538yCDy,Artigo,6-ensino-fundamental,False
...,...,...,...,...,...,...,...
21941,Preposição+Artigo,à,3,wb93AUnw5X,Preposição,7-ensino-fundamental,True
21942,Preposição+Artigo,à,4,wb93AUnw5X,Preposição+Artigo,7-ensino-fundamental,False
21943,Verbo,foi,5,wb93AUnw5X,Verbo,7-ensino-fundamental,False
21944,Substantivo,escola,6,wb93AUnw5X,Substantivo,7-ensino-fundamental,False


In [26]:
model_inputs = []
targets = []
for sessionId in tqdm(df.sessionId.unique()):
    df_filtered = df[df.sessionId == sessionId].reset_index(drop=True)
    targets.append(df_filtered.loc[0, 'serie'])

    # Entrada original com campos faltantes aqui:
    # model_input = f"Frase original: \"{df_filtered.loc[0, 'sentence']}\"\nTentativa completa: {df_filtered.loc[0, 'complete_try']}. Interações do aluno:\n"
    model_input = f"Interações do aluno:\n"
    for index, row in df_filtered.iterrows():
        model_input += (
            f"Palavra: {row['targetWord']}. "
            f"Tag correta: {row['correctTag']}. "
            f"Tag escolhida: {row['draggedTag']}. "
            f"Erro: {'sim' if row['isMistake'] else 'não'}. "
            f"Ordem da interação: {row['orderingId'] + 1}.\n"
        )

    model_input += "Fim."
    model_inputs.append(model_input)

dataset = pd.DataFrame()
dataset['input'] = model_inputs
dataset['target'] = targets

100%|██████████| 1113/1113 [00:02<00:00, 450.70it/s]


In [27]:
dataset

Unnamed: 0,input,target
0,Interações do aluno:\nPalavra: .. Tag correta:...,6-ensino-fundamental
1,Interações do aluno:\nPalavra: A. Tag correta:...,9-ensino-fundamental
2,Interações do aluno:\nPalavra: O. Tag correta:...,9-ensino-fundamental
3,Interações do aluno:\nPalavra: sinceramente. T...,9-ensino-fundamental
4,Interações do aluno:\nPalavra: tirou. Tag corr...,9-ensino-fundamental
...,...,...
1108,Interações do aluno:\nPalavra: .. Tag correta:...,9-ensino-fundamental
1109,Interações do aluno:\nPalavra: flores. Tag cor...,8-ensino-fundamental
1110,Interações do aluno:\nPalavra: ele. Tag corret...,7-ensino-fundamental
1111,Interações do aluno:\nPalavra: .. Tag correta:...,7-ensino-fundamental


In [28]:
dataset.target.value_counts()

5-ensino-fundamental    276
7-ensino-fundamental    247
8-ensino-fundamental    210
6-ensino-fundamental    198
9-ensino-fundamental    108
4-ensino-fundamental     29
3-ensino-medio           29
1-ensino-medio           12
2-ensino-medio            4
Name: target, dtype: int64

In [29]:
# Filtrando ensino médio (por enquanto)
dataset = dataset[(dataset.target != '1-ensino-medio')]
dataset = dataset[(dataset.target != '2-ensino-medio')]
dataset = dataset[(dataset.target != '3-ensino-medio')]
dataset = dataset.reset_index(drop=True)
dataset

Unnamed: 0,input,target
0,Interações do aluno:\nPalavra: .. Tag correta:...,6-ensino-fundamental
1,Interações do aluno:\nPalavra: A. Tag correta:...,9-ensino-fundamental
2,Interações do aluno:\nPalavra: O. Tag correta:...,9-ensino-fundamental
3,Interações do aluno:\nPalavra: sinceramente. T...,9-ensino-fundamental
4,Interações do aluno:\nPalavra: tirou. Tag corr...,9-ensino-fundamental
...,...,...
1063,Interações do aluno:\nPalavra: .. Tag correta:...,9-ensino-fundamental
1064,Interações do aluno:\nPalavra: flores. Tag cor...,8-ensino-fundamental
1065,Interações do aluno:\nPalavra: ele. Tag corret...,7-ensino-fundamental
1066,Interações do aluno:\nPalavra: .. Tag correta:...,7-ensino-fundamental


In [30]:
le = preprocessing.LabelEncoder()
le.fit(dataset.target)
dataset['label'] = le.transform(dataset.target)
dataset

Unnamed: 0,input,target,label
0,Interações do aluno:\nPalavra: .. Tag correta:...,6-ensino-fundamental,2
1,Interações do aluno:\nPalavra: A. Tag correta:...,9-ensino-fundamental,5
2,Interações do aluno:\nPalavra: O. Tag correta:...,9-ensino-fundamental,5
3,Interações do aluno:\nPalavra: sinceramente. T...,9-ensino-fundamental,5
4,Interações do aluno:\nPalavra: tirou. Tag corr...,9-ensino-fundamental,5
...,...,...,...
1063,Interações do aluno:\nPalavra: .. Tag correta:...,9-ensino-fundamental,5
1064,Interações do aluno:\nPalavra: flores. Tag cor...,8-ensino-fundamental,4
1065,Interações do aluno:\nPalavra: ele. Tag corret...,7-ensino-fundamental,3
1066,Interações do aluno:\nPalavra: .. Tag correta:...,7-ensino-fundamental,3


In [31]:
dataset.to_pickle('dataset_new.pkl')
joblib.dump(le, 'label_encoder_new.joblib')

['label_encoder_new.joblib']

# Instanciando modelo

In [31]:
df = pd.read_pickle('dataset_250603.pkl')
df

Unnamed: 0,input,target,label
0,"Frase original: ""Será que alguém explicou o qu...",5-ensino-fundamental,2
1,"Frase original: ""Ufa , achei que não fosse che...",5-ensino-fundamental,2
2,"Frase original: ""A folha caiu da árvore .""\nTe...",6-ensino-fundamental,3
3,"Frase original: ""É importante que todos os alu...",6-ensino-fundamental,3
4,"Frase original: ""Não acredito que você tenha f...",9-ensino-fundamental,6
...,...,...,...
1405,"Frase original: ""Eu acho que ele não vai gosta...",7-ensino-fundamental,4
1406,"Frase original: ""É importante que todos os alu...",7-ensino-fundamental,4
1407,"Frase original: ""O diretor aprovou a ideia del...",7-ensino-fundamental,4
1408,"Frase original: ""Maria foi à escola com João ....",7-ensino-fundamental,4


In [32]:
df.target.value_counts()

5-ensino-fundamental    310
7-ensino-fundamental    308
8-ensino-fundamental    278
6-ensino-fundamental    253
9-ensino-fundamental    177
3-ensino-medio           44
4-ensino-fundamental     40
Name: target, dtype: int64

In [33]:
df.label.value_counts()

2    310
4    308
5    278
3    253
6    177
0     44
1     40
Name: label, dtype: int64

In [34]:
df.label.value_counts(True)

2    0.219858
4    0.218440
5    0.197163
3    0.179433
6    0.125532
0    0.031206
1    0.028369
Name: label, dtype: float64

In [35]:
dataset = Dataset.from_pandas(df[['input', 'label']])
dataset = dataset.train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'label'],
        num_rows: 1128
    })
    test: Dataset({
        features: ['input', 'label'],
        num_rows: 282
    })
})

In [36]:
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

def tokenize(example):
    return tokenizer(example['input'], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/1128 [00:00<?, ? examples/s]

Map:   0%|          | 0/282 [00:00<?, ? examples/s]

In [37]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1128
    })
    test: Dataset({
        features: ['input', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 282
    })
})

In [38]:
joblib.dump(tokenized_dataset, "tokenized_dataset_250603.joblib")

['tokenized_dataset_250603.joblib']

In [39]:
num_labels = df['label'].nunique()

model = AutoModelForSequenceClassification.from_pretrained(
    "neuralmind/bert-base-portuguese-cased",
    num_labels=num_labels
)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

In [40]:
import evaluate
import torch
from transformers import TrainerCallback, TrainingArguments, Trainer
from transformers.utils.notebook import NotebookProgressCallback

# Métrica de avaliação
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), axis=-1)
    return accuracy.compute(predictions=preds, references=labels)

# Callback para salvar o melhor modelo com base na acurácia
class EvalAndSaveBestModelCallback(TrainerCallback):
    def __init__(self, save_path):
        self.save_path = save_path
        self.best_accuracy = 0.0

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        acc = metrics.get("eval_accuracy", 0.0)
        loss = metrics.get("eval_loss", None)

        print(f"\n→ Evaluation at epoch {int(state.epoch)}")
        print(f"Evaluation Accuracy: {acc:.4f}")
        if loss is not None:
            print(f"Evaluation Loss: {loss:.4f}")

        if acc > self.best_accuracy:
            print(f"→ New best accuracy! Saving model to {self.save_path}")
            self.best_accuracy = acc
            self.trainer.save_model(self.save_path)
            self.trainer.tokenizer.save_pretrained(self.save_path)

# Caminho para salvar o melhor modelo
save_best_model_path = "./results/best_model_250603"
eval_callback = EvalAndSaveBestModelCallback(save_path=save_best_model_path)

# Argumentos de treinamento
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none",
    evaluation_strategy="epoch",  
    save_strategy="no",           
)

# Inicializa o Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[eval_callback],
)

# Vincula trainer ao callback
eval_callback.trainer = trainer

# Inicia o treinamento
trainer.train()


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.769915,0.198582
2,No log,1.710688,0.283688
3,No log,1.6851,0.322695
4,No log,1.667442,0.340426
5,No log,1.665699,0.326241
6,No log,1.695517,0.315603
7,No log,1.702761,0.315603
8,No log,1.725819,0.308511
9,No log,1.770172,0.312057
10,No log,1.804084,0.287234



→ Evaluation at epoch 1
Evaluation Accuracy: 0.1986
Evaluation Loss: 1.7699
→ New best accuracy! Saving model to ./results/best_model_250603





→ Evaluation at epoch 2
Evaluation Accuracy: 0.2837
Evaluation Loss: 1.7107
→ New best accuracy! Saving model to ./results/best_model_250603





→ Evaluation at epoch 3
Evaluation Accuracy: 0.3227
Evaluation Loss: 1.6851
→ New best accuracy! Saving model to ./results/best_model_250603





→ Evaluation at epoch 4
Evaluation Accuracy: 0.3404
Evaluation Loss: 1.6674
→ New best accuracy! Saving model to ./results/best_model_250603





→ Evaluation at epoch 5
Evaluation Accuracy: 0.3262
Evaluation Loss: 1.6657





→ Evaluation at epoch 6
Evaluation Accuracy: 0.3156
Evaluation Loss: 1.6955





→ Evaluation at epoch 7
Evaluation Accuracy: 0.3156
Evaluation Loss: 1.7028





→ Evaluation at epoch 8
Evaluation Accuracy: 0.3085
Evaluation Loss: 1.7258





→ Evaluation at epoch 9
Evaluation Accuracy: 0.3121
Evaluation Loss: 1.7702





→ Evaluation at epoch 10
Evaluation Accuracy: 0.2872
Evaluation Loss: 1.8041





→ Evaluation at epoch 11
Evaluation Accuracy: 0.3050
Evaluation Loss: 1.8381





→ Evaluation at epoch 12
Evaluation Accuracy: 0.3298
Evaluation Loss: 1.8900





→ Evaluation at epoch 13
Evaluation Accuracy: 0.3262
Evaluation Loss: 1.9018





→ Evaluation at epoch 14
Evaluation Accuracy: 0.3191
Evaluation Loss: 1.9979





→ Evaluation at epoch 15
Evaluation Accuracy: 0.3121
Evaluation Loss: 2.0661





→ Evaluation at epoch 16
Evaluation Accuracy: 0.3156
Evaluation Loss: 2.1177





→ Evaluation at epoch 17
Evaluation Accuracy: 0.3121
Evaluation Loss: 2.1802





→ Evaluation at epoch 18
Evaluation Accuracy: 0.3156
Evaluation Loss: 2.2398





→ Evaluation at epoch 19
Evaluation Accuracy: 0.2943
Evaluation Loss: 2.3140





→ Evaluation at epoch 20
Evaluation Accuracy: 0.3014
Evaluation Loss: 2.3065





→ Evaluation at epoch 21
Evaluation Accuracy: 0.3085
Evaluation Loss: 2.3920





→ Evaluation at epoch 22
Evaluation Accuracy: 0.3121
Evaluation Loss: 2.4893





→ Evaluation at epoch 23
Evaluation Accuracy: 0.2979
Evaluation Loss: 2.4981





→ Evaluation at epoch 24
Evaluation Accuracy: 0.3085
Evaluation Loss: 2.5827





→ Evaluation at epoch 25
Evaluation Accuracy: 0.3121
Evaluation Loss: 2.6100





→ Evaluation at epoch 26
Evaluation Accuracy: 0.3014
Evaluation Loss: 2.6738





→ Evaluation at epoch 27
Evaluation Accuracy: 0.3014
Evaluation Loss: 2.7590





→ Evaluation at epoch 28
Evaluation Accuracy: 0.2730
Evaluation Loss: 2.8059





→ Evaluation at epoch 29
Evaluation Accuracy: 0.2837
Evaluation Loss: 2.8284





→ Evaluation at epoch 30
Evaluation Accuracy: 0.2801
Evaluation Loss: 2.8719





→ Evaluation at epoch 31
Evaluation Accuracy: 0.2695
Evaluation Loss: 2.9620





→ Evaluation at epoch 32
Evaluation Accuracy: 0.2766
Evaluation Loss: 2.9627





→ Evaluation at epoch 33
Evaluation Accuracy: 0.2589
Evaluation Loss: 3.0053





→ Evaluation at epoch 34
Evaluation Accuracy: 0.2730
Evaluation Loss: 3.0782





→ Evaluation at epoch 35
Evaluation Accuracy: 0.2730
Evaluation Loss: 3.1486





→ Evaluation at epoch 36
Evaluation Accuracy: 0.2624
Evaluation Loss: 3.0899





→ Evaluation at epoch 37
Evaluation Accuracy: 0.2624
Evaluation Loss: 3.2279





→ Evaluation at epoch 38
Evaluation Accuracy: 0.2589
Evaluation Loss: 3.2775





→ Evaluation at epoch 39
Evaluation Accuracy: 0.2695
Evaluation Loss: 3.2565





→ Evaluation at epoch 40
Evaluation Accuracy: 0.2695
Evaluation Loss: 3.2561





→ Evaluation at epoch 41
Evaluation Accuracy: 0.2730
Evaluation Loss: 3.3064





→ Evaluation at epoch 42
Evaluation Accuracy: 0.2624
Evaluation Loss: 3.2756





→ Evaluation at epoch 43
Evaluation Accuracy: 0.2589
Evaluation Loss: 3.2849





→ Evaluation at epoch 44
Evaluation Accuracy: 0.2695
Evaluation Loss: 3.3544





→ Evaluation at epoch 45
Evaluation Accuracy: 0.2660
Evaluation Loss: 3.3956





→ Evaluation at epoch 46
Evaluation Accuracy: 0.2589
Evaluation Loss: 3.3707





→ Evaluation at epoch 47
Evaluation Accuracy: 0.2766
Evaluation Loss: 3.4172





→ Evaluation at epoch 48
Evaluation Accuracy: 0.2695
Evaluation Loss: 3.3979





→ Evaluation at epoch 49
Evaluation Accuracy: 0.2730
Evaluation Loss: 3.3938





→ Evaluation at epoch 50
Evaluation Accuracy: 0.2730
Evaluation Loss: 3.3965


TrainOutput(global_step=1800, training_loss=0.6856089952256944, metrics={'train_runtime': 262.0726, 'train_samples_per_second': 215.208, 'train_steps_per_second': 6.868, 'total_flos': 3710032427520000.0, 'train_loss': 0.6856089952256944, 'epoch': 50.0})

## Kfold

In [46]:
import pandas as pd
import torch
from sklearn.model_selection import KFold
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
)
from datasets import Dataset
import evaluate
import os

# Carrega o dataset
df = pd.read_pickle('dataset_250603.pkl')

# Tokenizador
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

def tokenize(example):
    return tokenizer(example['input'], truncation=True, padding="max_length", max_length=128)

# Métrica
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), axis=-1)
    return accuracy.compute(predictions=preds, references=labels)

# Callback personalizado para salvar o melhor modelo
class EvalAndSaveBestModelCallback(TrainerCallback):
    def __init__(self, save_path):
        self.save_path = save_path
        self.best_accuracy = 0.0

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        acc = metrics.get("eval_accuracy", 0.0)
        loss = metrics.get("eval_loss", None)

        print(f"\n→ Evaluation at epoch {int(state.epoch)}")
        print(f"Evaluation Accuracy: {acc:.4f}")
        if loss is not None:
            print(f"Evaluation Loss: {loss:.4f}")

        if acc > self.best_accuracy:
            print(f"→ New best accuracy! Saving model to {self.save_path}")
            self.best_accuracy = acc
            self.trainer.save_model(self.save_path)
            self.trainer.tokenizer.save_pretrained(self.save_path)

# Número de folds
k = 5
skf = KFold(n_splits=k, shuffle=True, random_state=42)

X = df['input'].values
y = df['label'].values
num_labels = df['label'].nunique()

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    print(f"\n==== Fold {fold + 1}/{k} ====")
    
    # Divide os dados
    train_df = df.iloc[train_idx].reset_index(drop=True)
    test_df = df.iloc[test_idx].reset_index(drop=True)
    
    # Converte para Hugging Face Dataset e tokeniza
    train_dataset = Dataset.from_pandas(train_df[['input', 'label']])
    test_dataset = Dataset.from_pandas(test_df[['input', 'label']])
    train_dataset = train_dataset.map(tokenize, batched=True)
    test_dataset = test_dataset.map(tokenize, batched=True)

    # Modelo
    model = AutoModelForSequenceClassification.from_pretrained(
        "neuralmind/bert-base-portuguese-cased",
        num_labels=num_labels
    )

    # Paths
    save_path = f"./results/best_model_250603/fold_{fold + 1}"
    os.makedirs(save_path, exist_ok=True)
    eval_callback = EvalAndSaveBestModelCallback(save_path=save_path)

    # Argumentos de treinamento
    training_args = TrainingArguments(
        output_dir=save_path,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=50,
        weight_decay=0.01,
        logging_dir=f"{save_path}/logs",
        report_to="none",
        evaluation_strategy="epoch",
        save_strategy="no",
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[eval_callback],
    )

    # Vincula trainer ao callback
    eval_callback.trainer = trainer

    # Treina o modelo
    trainer.train()



==== Fold 1/5 ====


Map:   0%|          | 0/1128 [00:00<?, ? examples/s]

Map:   0%|          | 0/282 [00:00<?, ? examples/s]

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.816923,0.212766
2,No log,1.790872,0.234043
3,No log,1.765982,0.276596
4,No log,1.725673,0.301418
5,No log,1.746532,0.29078
6,No log,1.769729,0.336879
7,No log,1.778082,0.322695
8,No log,1.811076,0.301418
9,No log,1.825087,0.283688
10,No log,1.876462,0.27305



→ Evaluation at epoch 1
Evaluation Accuracy: 0.2128
Evaluation Loss: 1.8169
→ New best accuracy! Saving model to ./results/best_model_250603/fold_1





→ Evaluation at epoch 2
Evaluation Accuracy: 0.2340
Evaluation Loss: 1.7909
→ New best accuracy! Saving model to ./results/best_model_250603/fold_1





→ Evaluation at epoch 3
Evaluation Accuracy: 0.2766
Evaluation Loss: 1.7660
→ New best accuracy! Saving model to ./results/best_model_250603/fold_1





→ Evaluation at epoch 4
Evaluation Accuracy: 0.3014
Evaluation Loss: 1.7257
→ New best accuracy! Saving model to ./results/best_model_250603/fold_1





→ Evaluation at epoch 5
Evaluation Accuracy: 0.2908
Evaluation Loss: 1.7465





→ Evaluation at epoch 6
Evaluation Accuracy: 0.3369
Evaluation Loss: 1.7697
→ New best accuracy! Saving model to ./results/best_model_250603/fold_1





→ Evaluation at epoch 7
Evaluation Accuracy: 0.3227
Evaluation Loss: 1.7781





→ Evaluation at epoch 8
Evaluation Accuracy: 0.3014
Evaluation Loss: 1.8111





→ Evaluation at epoch 9
Evaluation Accuracy: 0.2837
Evaluation Loss: 1.8251





→ Evaluation at epoch 10
Evaluation Accuracy: 0.2730
Evaluation Loss: 1.8765





→ Evaluation at epoch 11
Evaluation Accuracy: 0.2943
Evaluation Loss: 1.9226





→ Evaluation at epoch 12
Evaluation Accuracy: 0.2766
Evaluation Loss: 1.9515





→ Evaluation at epoch 13
Evaluation Accuracy: 0.2553
Evaluation Loss: 2.0686





→ Evaluation at epoch 14
Evaluation Accuracy: 0.2872
Evaluation Loss: 2.0823





→ Evaluation at epoch 15
Evaluation Accuracy: 0.2589
Evaluation Loss: 2.1536





→ Evaluation at epoch 16
Evaluation Accuracy: 0.2730
Evaluation Loss: 2.2067





→ Evaluation at epoch 17
Evaluation Accuracy: 0.2589
Evaluation Loss: 2.2531





→ Evaluation at epoch 18
Evaluation Accuracy: 0.2553
Evaluation Loss: 2.3038





→ Evaluation at epoch 19
Evaluation Accuracy: 0.2340
Evaluation Loss: 2.3945





→ Evaluation at epoch 20
Evaluation Accuracy: 0.2624
Evaluation Loss: 2.4218





→ Evaluation at epoch 21
Evaluation Accuracy: 0.2482
Evaluation Loss: 2.4660





→ Evaluation at epoch 22
Evaluation Accuracy: 0.2411
Evaluation Loss: 2.5354





→ Evaluation at epoch 23
Evaluation Accuracy: 0.2660
Evaluation Loss: 2.5441





→ Evaluation at epoch 24
Evaluation Accuracy: 0.2553
Evaluation Loss: 2.6085





→ Evaluation at epoch 25
Evaluation Accuracy: 0.2589
Evaluation Loss: 2.6302





→ Evaluation at epoch 26
Evaluation Accuracy: 0.2482
Evaluation Loss: 2.7272





→ Evaluation at epoch 27
Evaluation Accuracy: 0.2376
Evaluation Loss: 2.7655





→ Evaluation at epoch 28
Evaluation Accuracy: 0.2447
Evaluation Loss: 2.7762





→ Evaluation at epoch 29
Evaluation Accuracy: 0.2482
Evaluation Loss: 2.7978





→ Evaluation at epoch 30
Evaluation Accuracy: 0.2518
Evaluation Loss: 2.8651





→ Evaluation at epoch 31
Evaluation Accuracy: 0.2305
Evaluation Loss: 2.9129





→ Evaluation at epoch 32
Evaluation Accuracy: 0.2411
Evaluation Loss: 2.9792





→ Evaluation at epoch 33
Evaluation Accuracy: 0.2589
Evaluation Loss: 3.0012





→ Evaluation at epoch 34
Evaluation Accuracy: 0.2518
Evaluation Loss: 3.0424





→ Evaluation at epoch 35
Evaluation Accuracy: 0.2376
Evaluation Loss: 3.0322





→ Evaluation at epoch 36
Evaluation Accuracy: 0.2589
Evaluation Loss: 3.0813





→ Evaluation at epoch 37
Evaluation Accuracy: 0.2518
Evaluation Loss: 3.1663





→ Evaluation at epoch 38
Evaluation Accuracy: 0.2553
Evaluation Loss: 3.1435





→ Evaluation at epoch 39
Evaluation Accuracy: 0.2411
Evaluation Loss: 3.1887





→ Evaluation at epoch 40
Evaluation Accuracy: 0.2518
Evaluation Loss: 3.2273





→ Evaluation at epoch 41
Evaluation Accuracy: 0.2411
Evaluation Loss: 3.2020





→ Evaluation at epoch 42
Evaluation Accuracy: 0.2340
Evaluation Loss: 3.2035





→ Evaluation at epoch 43
Evaluation Accuracy: 0.2376
Evaluation Loss: 3.2666





→ Evaluation at epoch 44
Evaluation Accuracy: 0.2411
Evaluation Loss: 3.2569





→ Evaluation at epoch 45
Evaluation Accuracy: 0.2518
Evaluation Loss: 3.2510





→ Evaluation at epoch 46
Evaluation Accuracy: 0.2482
Evaluation Loss: 3.3083





→ Evaluation at epoch 47
Evaluation Accuracy: 0.2376
Evaluation Loss: 3.3161





→ Evaluation at epoch 48
Evaluation Accuracy: 0.2482
Evaluation Loss: 3.3061





→ Evaluation at epoch 49
Evaluation Accuracy: 0.2482
Evaluation Loss: 3.3042





→ Evaluation at epoch 50
Evaluation Accuracy: 0.2447
Evaluation Loss: 3.3109

==== Fold 2/5 ====


Map:   0%|          | 0/1128 [00:00<?, ? examples/s]

Map:   0%|          | 0/282 [00:00<?, ? examples/s]

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.771153,0.166667
2,No log,1.735921,0.287234
3,No log,1.750481,0.237589
4,No log,1.716604,0.280142
5,No log,1.696218,0.308511
6,No log,1.711199,0.308511
7,No log,1.735022,0.283688
8,No log,1.75191,0.265957
9,No log,1.824287,0.255319
10,No log,1.893992,0.241135



→ Evaluation at epoch 1
Evaluation Accuracy: 0.1667
Evaluation Loss: 1.7712
→ New best accuracy! Saving model to ./results/best_model_250603/fold_2





→ Evaluation at epoch 2
Evaluation Accuracy: 0.2872
Evaluation Loss: 1.7359
→ New best accuracy! Saving model to ./results/best_model_250603/fold_2





→ Evaluation at epoch 3
Evaluation Accuracy: 0.2376
Evaluation Loss: 1.7505





→ Evaluation at epoch 4
Evaluation Accuracy: 0.2801
Evaluation Loss: 1.7166





→ Evaluation at epoch 5
Evaluation Accuracy: 0.3085
Evaluation Loss: 1.6962
→ New best accuracy! Saving model to ./results/best_model_250603/fold_2





→ Evaluation at epoch 6
Evaluation Accuracy: 0.3085
Evaluation Loss: 1.7112





→ Evaluation at epoch 7
Evaluation Accuracy: 0.2837
Evaluation Loss: 1.7350





→ Evaluation at epoch 8
Evaluation Accuracy: 0.2660
Evaluation Loss: 1.7519





→ Evaluation at epoch 9
Evaluation Accuracy: 0.2553
Evaluation Loss: 1.8243





→ Evaluation at epoch 10
Evaluation Accuracy: 0.2411
Evaluation Loss: 1.8940





→ Evaluation at epoch 11
Evaluation Accuracy: 0.2589
Evaluation Loss: 1.9235





→ Evaluation at epoch 12
Evaluation Accuracy: 0.2837
Evaluation Loss: 1.9593





→ Evaluation at epoch 13
Evaluation Accuracy: 0.2553
Evaluation Loss: 2.0145





→ Evaluation at epoch 14
Evaluation Accuracy: 0.2411
Evaluation Loss: 2.1467





→ Evaluation at epoch 15
Evaluation Accuracy: 0.2411
Evaluation Loss: 2.1826





→ Evaluation at epoch 16
Evaluation Accuracy: 0.2447
Evaluation Loss: 2.2311





→ Evaluation at epoch 17
Evaluation Accuracy: 0.2695
Evaluation Loss: 2.3204





→ Evaluation at epoch 18
Evaluation Accuracy: 0.2376
Evaluation Loss: 2.4109





→ Evaluation at epoch 19
Evaluation Accuracy: 0.2128
Evaluation Loss: 2.5726





→ Evaluation at epoch 20
Evaluation Accuracy: 0.2376
Evaluation Loss: 2.5807





→ Evaluation at epoch 21
Evaluation Accuracy: 0.2305
Evaluation Loss: 2.7210





→ Evaluation at epoch 22
Evaluation Accuracy: 0.2021
Evaluation Loss: 2.8034





→ Evaluation at epoch 23
Evaluation Accuracy: 0.2482
Evaluation Loss: 2.7809





→ Evaluation at epoch 24
Evaluation Accuracy: 0.2270
Evaluation Loss: 2.9031





→ Evaluation at epoch 25
Evaluation Accuracy: 0.2482
Evaluation Loss: 2.9619





→ Evaluation at epoch 26
Evaluation Accuracy: 0.2092
Evaluation Loss: 3.0627





→ Evaluation at epoch 27
Evaluation Accuracy: 0.2234
Evaluation Loss: 3.0748





→ Evaluation at epoch 28
Evaluation Accuracy: 0.2376
Evaluation Loss: 3.1102





→ Evaluation at epoch 29
Evaluation Accuracy: 0.2447
Evaluation Loss: 3.2158





→ Evaluation at epoch 30
Evaluation Accuracy: 0.2411
Evaluation Loss: 3.2262





→ Evaluation at epoch 31
Evaluation Accuracy: 0.2447
Evaluation Loss: 3.3101





→ Evaluation at epoch 32
Evaluation Accuracy: 0.2234
Evaluation Loss: 3.3843





→ Evaluation at epoch 33
Evaluation Accuracy: 0.2305
Evaluation Loss: 3.3532





→ Evaluation at epoch 34
Evaluation Accuracy: 0.2092
Evaluation Loss: 3.4814





→ Evaluation at epoch 35
Evaluation Accuracy: 0.2128
Evaluation Loss: 3.5664





→ Evaluation at epoch 36
Evaluation Accuracy: 0.2057
Evaluation Loss: 3.5746





→ Evaluation at epoch 37
Evaluation Accuracy: 0.2305
Evaluation Loss: 3.6204





→ Evaluation at epoch 38
Evaluation Accuracy: 0.2163
Evaluation Loss: 3.6599





→ Evaluation at epoch 39
Evaluation Accuracy: 0.2270
Evaluation Loss: 3.6621





→ Evaluation at epoch 40
Evaluation Accuracy: 0.2234
Evaluation Loss: 3.6739





→ Evaluation at epoch 41
Evaluation Accuracy: 0.2163
Evaluation Loss: 3.7473





→ Evaluation at epoch 42
Evaluation Accuracy: 0.2092
Evaluation Loss: 3.7925





→ Evaluation at epoch 43
Evaluation Accuracy: 0.2305
Evaluation Loss: 3.7779





→ Evaluation at epoch 44
Evaluation Accuracy: 0.2057
Evaluation Loss: 3.8350





→ Evaluation at epoch 45
Evaluation Accuracy: 0.2128
Evaluation Loss: 3.8147





→ Evaluation at epoch 46
Evaluation Accuracy: 0.2092
Evaluation Loss: 3.8466





→ Evaluation at epoch 47
Evaluation Accuracy: 0.2057
Evaluation Loss: 3.8748





→ Evaluation at epoch 48
Evaluation Accuracy: 0.2057
Evaluation Loss: 3.8580





→ Evaluation at epoch 49
Evaluation Accuracy: 0.2128
Evaluation Loss: 3.8741





→ Evaluation at epoch 50
Evaluation Accuracy: 0.2092
Evaluation Loss: 3.8852

==== Fold 3/5 ====


Map:   0%|          | 0/1128 [00:00<?, ? examples/s]

Map:   0%|          | 0/282 [00:00<?, ? examples/s]

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.766199,0.27305
2,No log,1.772198,0.244681
3,No log,1.723583,0.27305
4,No log,1.750736,0.248227
5,No log,1.75723,0.258865
6,No log,1.799542,0.22695
7,No log,1.785802,0.230496
8,No log,1.887336,0.234043
9,No log,1.88976,0.212766
10,No log,1.952035,0.241135



→ Evaluation at epoch 1
Evaluation Accuracy: 0.2730
Evaluation Loss: 1.7662
→ New best accuracy! Saving model to ./results/best_model_250603/fold_3





→ Evaluation at epoch 2
Evaluation Accuracy: 0.2447
Evaluation Loss: 1.7722





→ Evaluation at epoch 3
Evaluation Accuracy: 0.2730
Evaluation Loss: 1.7236





→ Evaluation at epoch 4
Evaluation Accuracy: 0.2482
Evaluation Loss: 1.7507





→ Evaluation at epoch 5
Evaluation Accuracy: 0.2589
Evaluation Loss: 1.7572





→ Evaluation at epoch 6
Evaluation Accuracy: 0.2270
Evaluation Loss: 1.7995





→ Evaluation at epoch 7
Evaluation Accuracy: 0.2305
Evaluation Loss: 1.7858





→ Evaluation at epoch 8
Evaluation Accuracy: 0.2340
Evaluation Loss: 1.8873





→ Evaluation at epoch 9
Evaluation Accuracy: 0.2128
Evaluation Loss: 1.8898





→ Evaluation at epoch 10
Evaluation Accuracy: 0.2411
Evaluation Loss: 1.9520





→ Evaluation at epoch 11
Evaluation Accuracy: 0.2163
Evaluation Loss: 1.9739





→ Evaluation at epoch 12
Evaluation Accuracy: 0.2234
Evaluation Loss: 2.0404





→ Evaluation at epoch 13
Evaluation Accuracy: 0.2234
Evaluation Loss: 2.1195





→ Evaluation at epoch 14
Evaluation Accuracy: 0.2199
Evaluation Loss: 2.2156





→ Evaluation at epoch 15
Evaluation Accuracy: 0.2199
Evaluation Loss: 2.2272





→ Evaluation at epoch 16
Evaluation Accuracy: 0.2340
Evaluation Loss: 2.3097





→ Evaluation at epoch 17
Evaluation Accuracy: 0.2270
Evaluation Loss: 2.4369





→ Evaluation at epoch 18
Evaluation Accuracy: 0.2057
Evaluation Loss: 2.5284





→ Evaluation at epoch 19
Evaluation Accuracy: 0.2340
Evaluation Loss: 2.5181





→ Evaluation at epoch 20
Evaluation Accuracy: 0.2376
Evaluation Loss: 2.6198





→ Evaluation at epoch 21
Evaluation Accuracy: 0.2163
Evaluation Loss: 2.7449





→ Evaluation at epoch 22
Evaluation Accuracy: 0.2411
Evaluation Loss: 2.7672





→ Evaluation at epoch 23
Evaluation Accuracy: 0.2234
Evaluation Loss: 2.8185





→ Evaluation at epoch 24
Evaluation Accuracy: 0.2163
Evaluation Loss: 2.9433





→ Evaluation at epoch 25
Evaluation Accuracy: 0.1844
Evaluation Loss: 2.9713





→ Evaluation at epoch 26
Evaluation Accuracy: 0.2057
Evaluation Loss: 3.0252





→ Evaluation at epoch 27
Evaluation Accuracy: 0.2092
Evaluation Loss: 3.0394





→ Evaluation at epoch 28
Evaluation Accuracy: 0.2021
Evaluation Loss: 3.1585





→ Evaluation at epoch 29
Evaluation Accuracy: 0.2128
Evaluation Loss: 3.1590





→ Evaluation at epoch 30
Evaluation Accuracy: 0.2163
Evaluation Loss: 3.2858





→ Evaluation at epoch 31
Evaluation Accuracy: 0.2057
Evaluation Loss: 3.3501





→ Evaluation at epoch 32
Evaluation Accuracy: 0.1950
Evaluation Loss: 3.4103





→ Evaluation at epoch 33
Evaluation Accuracy: 0.2128
Evaluation Loss: 3.4249





→ Evaluation at epoch 34
Evaluation Accuracy: 0.1879
Evaluation Loss: 3.5436





→ Evaluation at epoch 35
Evaluation Accuracy: 0.2057
Evaluation Loss: 3.5444





→ Evaluation at epoch 36
Evaluation Accuracy: 0.1950
Evaluation Loss: 3.6176





→ Evaluation at epoch 37
Evaluation Accuracy: 0.2199
Evaluation Loss: 3.5831





→ Evaluation at epoch 38
Evaluation Accuracy: 0.1950
Evaluation Loss: 3.6262





→ Evaluation at epoch 39
Evaluation Accuracy: 0.2092
Evaluation Loss: 3.6838





→ Evaluation at epoch 40
Evaluation Accuracy: 0.2163
Evaluation Loss: 3.7389





→ Evaluation at epoch 41
Evaluation Accuracy: 0.2163
Evaluation Loss: 3.7476





→ Evaluation at epoch 42
Evaluation Accuracy: 0.2199
Evaluation Loss: 3.7597





→ Evaluation at epoch 43
Evaluation Accuracy: 0.1950
Evaluation Loss: 3.7971





→ Evaluation at epoch 44
Evaluation Accuracy: 0.1986
Evaluation Loss: 3.7740





→ Evaluation at epoch 45
Evaluation Accuracy: 0.1986
Evaluation Loss: 3.7983





→ Evaluation at epoch 46
Evaluation Accuracy: 0.1986
Evaluation Loss: 3.8030





→ Evaluation at epoch 47
Evaluation Accuracy: 0.2057
Evaluation Loss: 3.8039





→ Evaluation at epoch 48
Evaluation Accuracy: 0.2021
Evaluation Loss: 3.8318





→ Evaluation at epoch 49
Evaluation Accuracy: 0.2057
Evaluation Loss: 3.8344





→ Evaluation at epoch 50
Evaluation Accuracy: 0.2057
Evaluation Loss: 3.8327

==== Fold 4/5 ====


Map:   0%|          | 0/1128 [00:00<?, ? examples/s]

Map:   0%|          | 0/282 [00:00<?, ? examples/s]

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.750362,0.237589
2,No log,1.752954,0.22695
3,No log,1.734819,0.276596
4,No log,1.70238,0.283688
5,No log,1.729954,0.29078
6,No log,1.698324,0.29078
7,No log,1.711958,0.283688
8,No log,1.711419,0.276596
9,No log,1.72304,0.294326
10,No log,1.770352,0.29078



→ Evaluation at epoch 1
Evaluation Accuracy: 0.2376
Evaluation Loss: 1.7504
→ New best accuracy! Saving model to ./results/best_model_250603/fold_4





→ Evaluation at epoch 2
Evaluation Accuracy: 0.2270
Evaluation Loss: 1.7530





→ Evaluation at epoch 3
Evaluation Accuracy: 0.2766
Evaluation Loss: 1.7348
→ New best accuracy! Saving model to ./results/best_model_250603/fold_4





→ Evaluation at epoch 4
Evaluation Accuracy: 0.2837
Evaluation Loss: 1.7024
→ New best accuracy! Saving model to ./results/best_model_250603/fold_4





→ Evaluation at epoch 5
Evaluation Accuracy: 0.2908
Evaluation Loss: 1.7300
→ New best accuracy! Saving model to ./results/best_model_250603/fold_4





→ Evaluation at epoch 6
Evaluation Accuracy: 0.2908
Evaluation Loss: 1.6983





→ Evaluation at epoch 7
Evaluation Accuracy: 0.2837
Evaluation Loss: 1.7120





→ Evaluation at epoch 8
Evaluation Accuracy: 0.2766
Evaluation Loss: 1.7114





→ Evaluation at epoch 9
Evaluation Accuracy: 0.2943
Evaluation Loss: 1.7230
→ New best accuracy! Saving model to ./results/best_model_250603/fold_4





→ Evaluation at epoch 10
Evaluation Accuracy: 0.2908
Evaluation Loss: 1.7704





→ Evaluation at epoch 11
Evaluation Accuracy: 0.3085
Evaluation Loss: 1.8361
→ New best accuracy! Saving model to ./results/best_model_250603/fold_4





→ Evaluation at epoch 12
Evaluation Accuracy: 0.2908
Evaluation Loss: 1.8962





→ Evaluation at epoch 13
Evaluation Accuracy: 0.2837
Evaluation Loss: 1.9518





→ Evaluation at epoch 14
Evaluation Accuracy: 0.3085
Evaluation Loss: 2.0312





→ Evaluation at epoch 15
Evaluation Accuracy: 0.3014
Evaluation Loss: 2.1196





→ Evaluation at epoch 16
Evaluation Accuracy: 0.2801
Evaluation Loss: 2.1536





→ Evaluation at epoch 17
Evaluation Accuracy: 0.2695
Evaluation Loss: 2.2542





→ Evaluation at epoch 18
Evaluation Accuracy: 0.2766
Evaluation Loss: 2.3077





→ Evaluation at epoch 19
Evaluation Accuracy: 0.2589
Evaluation Loss: 2.3427





→ Evaluation at epoch 20
Evaluation Accuracy: 0.2624
Evaluation Loss: 2.4679





→ Evaluation at epoch 21
Evaluation Accuracy: 0.2624
Evaluation Loss: 2.5302





→ Evaluation at epoch 22
Evaluation Accuracy: 0.2447
Evaluation Loss: 2.6221





→ Evaluation at epoch 23
Evaluation Accuracy: 0.2553
Evaluation Loss: 2.6709





→ Evaluation at epoch 24
Evaluation Accuracy: 0.2376
Evaluation Loss: 2.7569





→ Evaluation at epoch 25
Evaluation Accuracy: 0.2447
Evaluation Loss: 2.8375





→ Evaluation at epoch 26
Evaluation Accuracy: 0.2589
Evaluation Loss: 2.8671





→ Evaluation at epoch 27
Evaluation Accuracy: 0.2553
Evaluation Loss: 2.9763





→ Evaluation at epoch 28
Evaluation Accuracy: 0.2234
Evaluation Loss: 3.0548





→ Evaluation at epoch 29
Evaluation Accuracy: 0.2305
Evaluation Loss: 3.0889





→ Evaluation at epoch 30
Evaluation Accuracy: 0.2234
Evaluation Loss: 3.1531





→ Evaluation at epoch 31
Evaluation Accuracy: 0.2340
Evaluation Loss: 3.1678





→ Evaluation at epoch 32
Evaluation Accuracy: 0.2340
Evaluation Loss: 3.2649





→ Evaluation at epoch 33
Evaluation Accuracy: 0.2234
Evaluation Loss: 3.2524





→ Evaluation at epoch 34
Evaluation Accuracy: 0.2305
Evaluation Loss: 3.3514





→ Evaluation at epoch 35
Evaluation Accuracy: 0.2163
Evaluation Loss: 3.3602





→ Evaluation at epoch 36
Evaluation Accuracy: 0.2128
Evaluation Loss: 3.4106





→ Evaluation at epoch 37
Evaluation Accuracy: 0.2305
Evaluation Loss: 3.4726





→ Evaluation at epoch 38
Evaluation Accuracy: 0.2163
Evaluation Loss: 3.5296





→ Evaluation at epoch 39
Evaluation Accuracy: 0.2234
Evaluation Loss: 3.5117





→ Evaluation at epoch 40
Evaluation Accuracy: 0.2163
Evaluation Loss: 3.5706





→ Evaluation at epoch 41
Evaluation Accuracy: 0.2270
Evaluation Loss: 3.5800





→ Evaluation at epoch 42
Evaluation Accuracy: 0.2163
Evaluation Loss: 3.6499





→ Evaluation at epoch 43
Evaluation Accuracy: 0.2021
Evaluation Loss: 3.7035





→ Evaluation at epoch 44
Evaluation Accuracy: 0.2234
Evaluation Loss: 3.6766





→ Evaluation at epoch 45
Evaluation Accuracy: 0.2163
Evaluation Loss: 3.7424





→ Evaluation at epoch 46
Evaluation Accuracy: 0.2021
Evaluation Loss: 3.7564





→ Evaluation at epoch 47
Evaluation Accuracy: 0.2128
Evaluation Loss: 3.7681





→ Evaluation at epoch 48
Evaluation Accuracy: 0.2057
Evaluation Loss: 3.7417





→ Evaluation at epoch 49
Evaluation Accuracy: 0.2057
Evaluation Loss: 3.7641





→ Evaluation at epoch 50
Evaluation Accuracy: 0.2092
Evaluation Loss: 3.7736

==== Fold 5/5 ====


Map:   0%|          | 0/1128 [00:00<?, ? examples/s]

Map:   0%|          | 0/282 [00:00<?, ? examples/s]

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.750319,0.258865
2,No log,1.728271,0.276596
3,No log,1.698971,0.283688
4,No log,1.702419,0.315603
5,No log,1.709484,0.319149
6,No log,1.834521,0.248227
7,No log,1.756551,0.280142
8,No log,1.802946,0.301418
9,No log,1.83076,0.276596
10,No log,1.800635,0.301418



→ Evaluation at epoch 1
Evaluation Accuracy: 0.2589
Evaluation Loss: 1.7503
→ New best accuracy! Saving model to ./results/best_model_250603/fold_5





→ Evaluation at epoch 2
Evaluation Accuracy: 0.2766
Evaluation Loss: 1.7283
→ New best accuracy! Saving model to ./results/best_model_250603/fold_5





→ Evaluation at epoch 3
Evaluation Accuracy: 0.2837
Evaluation Loss: 1.6990
→ New best accuracy! Saving model to ./results/best_model_250603/fold_5





→ Evaluation at epoch 4
Evaluation Accuracy: 0.3156
Evaluation Loss: 1.7024
→ New best accuracy! Saving model to ./results/best_model_250603/fold_5





→ Evaluation at epoch 5
Evaluation Accuracy: 0.3191
Evaluation Loss: 1.7095
→ New best accuracy! Saving model to ./results/best_model_250603/fold_5





→ Evaluation at epoch 6
Evaluation Accuracy: 0.2482
Evaluation Loss: 1.8345





→ Evaluation at epoch 7
Evaluation Accuracy: 0.2801
Evaluation Loss: 1.7566





→ Evaluation at epoch 8
Evaluation Accuracy: 0.3014
Evaluation Loss: 1.8029





→ Evaluation at epoch 9
Evaluation Accuracy: 0.2766
Evaluation Loss: 1.8308





→ Evaluation at epoch 10
Evaluation Accuracy: 0.3014
Evaluation Loss: 1.8006





→ Evaluation at epoch 11
Evaluation Accuracy: 0.2766
Evaluation Loss: 1.8573





→ Evaluation at epoch 12
Evaluation Accuracy: 0.2908
Evaluation Loss: 1.8922





→ Evaluation at epoch 13
Evaluation Accuracy: 0.2801
Evaluation Loss: 1.9891





→ Evaluation at epoch 14
Evaluation Accuracy: 0.2801
Evaluation Loss: 2.0060





→ Evaluation at epoch 15
Evaluation Accuracy: 0.2801
Evaluation Loss: 2.1134





→ Evaluation at epoch 16
Evaluation Accuracy: 0.2801
Evaluation Loss: 2.1736





→ Evaluation at epoch 17
Evaluation Accuracy: 0.2730
Evaluation Loss: 2.1940





→ Evaluation at epoch 18
Evaluation Accuracy: 0.2695
Evaluation Loss: 2.2857





→ Evaluation at epoch 19
Evaluation Accuracy: 0.2730
Evaluation Loss: 2.3373





→ Evaluation at epoch 20
Evaluation Accuracy: 0.2518
Evaluation Loss: 2.3797





→ Evaluation at epoch 21
Evaluation Accuracy: 0.2766
Evaluation Loss: 2.4339





→ Evaluation at epoch 22
Evaluation Accuracy: 0.2624
Evaluation Loss: 2.5422





→ Evaluation at epoch 23
Evaluation Accuracy: 0.2589
Evaluation Loss: 2.6153





→ Evaluation at epoch 24
Evaluation Accuracy: 0.2482
Evaluation Loss: 2.6772





→ Evaluation at epoch 25
Evaluation Accuracy: 0.2660
Evaluation Loss: 2.6888





→ Evaluation at epoch 26
Evaluation Accuracy: 0.2305
Evaluation Loss: 2.8023





→ Evaluation at epoch 27
Evaluation Accuracy: 0.2730
Evaluation Loss: 2.8656





→ Evaluation at epoch 28
Evaluation Accuracy: 0.2447
Evaluation Loss: 2.9379





→ Evaluation at epoch 29
Evaluation Accuracy: 0.2730
Evaluation Loss: 2.9356





→ Evaluation at epoch 30
Evaluation Accuracy: 0.2660
Evaluation Loss: 2.9985





→ Evaluation at epoch 31
Evaluation Accuracy: 0.2589
Evaluation Loss: 3.0789





→ Evaluation at epoch 32
Evaluation Accuracy: 0.2447
Evaluation Loss: 3.0927





→ Evaluation at epoch 33
Evaluation Accuracy: 0.2801
Evaluation Loss: 3.0953





→ Evaluation at epoch 34
Evaluation Accuracy: 0.2695
Evaluation Loss: 3.1630





→ Evaluation at epoch 35
Evaluation Accuracy: 0.2518
Evaluation Loss: 3.1719





→ Evaluation at epoch 36
Evaluation Accuracy: 0.2624
Evaluation Loss: 3.2300





→ Evaluation at epoch 37
Evaluation Accuracy: 0.2589
Evaluation Loss: 3.2652





→ Evaluation at epoch 38
Evaluation Accuracy: 0.2766
Evaluation Loss: 3.2549





→ Evaluation at epoch 39
Evaluation Accuracy: 0.2872
Evaluation Loss: 3.2983





→ Evaluation at epoch 40
Evaluation Accuracy: 0.2624
Evaluation Loss: 3.3547





→ Evaluation at epoch 41
Evaluation Accuracy: 0.2730
Evaluation Loss: 3.3734





→ Evaluation at epoch 42
Evaluation Accuracy: 0.2801
Evaluation Loss: 3.3804





→ Evaluation at epoch 43
Evaluation Accuracy: 0.2695
Evaluation Loss: 3.3741





→ Evaluation at epoch 44
Evaluation Accuracy: 0.2624
Evaluation Loss: 3.4118





→ Evaluation at epoch 45
Evaluation Accuracy: 0.2589
Evaluation Loss: 3.4339





→ Evaluation at epoch 46
Evaluation Accuracy: 0.2589
Evaluation Loss: 3.4553





→ Evaluation at epoch 47
Evaluation Accuracy: 0.2660
Evaluation Loss: 3.4746





→ Evaluation at epoch 48
Evaluation Accuracy: 0.2553
Evaluation Loss: 3.4830





→ Evaluation at epoch 49
Evaluation Accuracy: 0.2624
Evaluation Loss: 3.4796





→ Evaluation at epoch 50
Evaluation Accuracy: 0.2624
Evaluation Loss: 3.4825


# Evaluating

## Original model

In [47]:
tokenized_dataset = joblib.load("tokenized_dataset.joblib")
model_path = "./results/best_model"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), axis=-1)
    return accuracy.compute(predictions=preds, references=labels)
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results/eval",
    per_device_eval_batch_size=16,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run evaluation on test set
metrics = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(metrics)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 1.4698394536972046, 'eval_accuracy': 0.36627906976744184, 'eval_runtime': 0.3236, 'eval_samples_per_second': 531.465, 'eval_steps_per_second': 18.539}


## New model

In [48]:
tokenized_dataset = joblib.load("tokenized_dataset_new.joblib")
model_path = "./results/best_model_new"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), axis=-1)
    return accuracy.compute(predictions=preds, references=labels)
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results/eval",
    per_device_eval_batch_size=16,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run evaluation on test set
metrics = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(metrics)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 5.627876281738281, 'eval_accuracy': 0.3317757009345794, 'eval_runtime': 0.4083, 'eval_samples_per_second': 524.105, 'eval_steps_per_second': 17.144}


## 250603 model

In [49]:
tokenized_dataset = joblib.load("tokenized_dataset_250603.joblib")
model_path = "./results/best_model_250603"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), axis=-1)
    return accuracy.compute(predictions=preds, references=labels)
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results/eval",
    per_device_eval_batch_size=16,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run evaluation on test set
metrics = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(metrics)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 1.6674420833587646, 'eval_accuracy': 0.3404255319148936, 'eval_runtime': 0.4699, 'eval_samples_per_second': 600.167, 'eval_steps_per_second': 19.154}


## Kfold models

In [51]:
fold_paths = "/home/daniel/projects/pln/results/best_model_250603/fold_{fold}"
tokenized_dataset = joblib.load("tokenized_dataset_250603.joblib")

for i in range(1, 6):
    print(f"====== Fold {i} ======")
    model_path = fold_paths.format(fold=i)

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    accuracy = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = torch.argmax(torch.tensor(logits), axis=-1)
        return accuracy.compute(predictions=preds, references=labels)
    from transformers import Trainer, TrainingArguments

    training_args = TrainingArguments(
        output_dir="./results/eval",
        per_device_eval_batch_size=16,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Run evaluation on test set
    metrics = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
    print(metrics)



You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 1.5132287740707397, 'eval_accuracy': 0.4219858156028369, 'eval_runtime': 0.46, 'eval_samples_per_second': 613.047, 'eval_steps_per_second': 19.565}


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 1.5997364521026611, 'eval_accuracy': 0.35815602836879434, 'eval_runtime': 0.4635, 'eval_samples_per_second': 608.422, 'eval_steps_per_second': 19.418}


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 1.7352327108383179, 'eval_accuracy': 0.2872340425531915, 'eval_runtime': 0.4631, 'eval_samples_per_second': 609.0, 'eval_steps_per_second': 19.436}


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 1.256258249282837, 'eval_accuracy': 0.5460992907801419, 'eval_runtime': 0.6219, 'eval_samples_per_second': 453.443, 'eval_steps_per_second': 14.472}


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 1.553895354270935, 'eval_accuracy': 0.3900709219858156, 'eval_runtime': 0.4923, 'eval_samples_per_second': 572.877, 'eval_steps_per_second': 18.283}
