#Importações

In [None]:
import pandas as pd
import numpy as np
import torch

In [None]:
!pip -qqq install transformers
!pip -qqq install accelerate -U
!pip -qqq install datasets
!pip -qqq install evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric
import evaluate

#Definição de funções para conversão de datasets para question answering

In [None]:
def _create_qa_b_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Create a dataframe with the QA-B format."""
    rows = []

    for review, polarity, aspect in zip(df.review, df.polarity, df.aspect):
        question = ' '.join(["A polaridade de", aspect, "é"])

        # For each of the possible polarities, a new row will be created
        rows.append({
            'review': review,
            'question': ' '.join([question, 'positiva?']),
            'label': int(polarity == 1)
        })
        rows.append({
            'review': review,
            'question': ' '.join([question, 'neutra?']),
            'label': int(polarity == 0)
        })
        rows.append({
            'review': review,
            'question': ' '.join([question, 'negativa?']),
            'label': int(polarity == -1)
        })

    return pd.DataFrame(rows)


def _create_qa_m_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Create a dataframe with the QA-M format."""
    rows = []
    question = "Qual a polaridade de "

    for review, polarity, aspect in zip(df.review, df.polarity, df.aspect):
        rows.append({
            'review': review,
            'question': ''.join([question, aspect, '?']),
            'label': polarity
        })

    return pd.DataFrame(rows)


def _create_nli_b_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Create a dataframe with the NLI-B format."""
    rows = []

    for review, polarity, aspect in zip(df.review, df.polarity, df.aspect):
        # For each of the possible polarities, a new row will be created
        rows.append({
            'review': review,
            'question': ' '.join([aspect, ' - ', 'positivo']),
            'label': str(polarity == 1)
        })
        rows.append({
            'review': review,
            'question': ' '.join([aspect, ' - ', 'neutro']),
            'label': str(polarity == 0)
        })
        rows.append({
            'review': review,
            'question': ' '.join([aspect, ' - ', 'negativo']),
            'label': str(polarity == -1)
        })

    return pd.DataFrame(rows)


def _create_nli_m_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Create a dataframe with the QA-M format."""
    rows = []

    for review, polarity, aspect in zip(df.review, df.polarity, df.aspect):
        rows.append({
            'review': review,
            'question': aspect,
            'label': polarity
        })

    return pd.DataFrame(rows)

In [None]:
def create_dataframe(df: pd.DataFrame, format: str) -> pd.DataFrame:
    """Convert an aspect-sentiment frame into one of the auxiliary-sentence formats.

    Args:
        df: A pandas DataFrame with the columns 'review', 'polarity' and
            'aspect'. Polarity is expected to be an integer: 1 (positive),
            0 (neutral) or -1 (negative).

        format: Name of the target format, one of
            "qa-m", "qa-b", "nli-m" or "nli-b".

    Returns:
        A pandas DataFrame with the columns "review", "question" and "label".

        The "question" column is present for the NLI formats as well.
        The "label" column holds the original polarity (1, 0, -1) for the
        `-m` formats, the integers 1/0 for "qa-b" and the strings
        "True"/"False" for "nli-b".

    Raises:
        ValueError: If `format` is not one of the four supported names.
    """
    # Guard clauses, tested in the same order as the supported-format list.
    if format == "qa-m":
        return _create_qa_m_dataframe(df)
    if format == "qa-b":
        return _create_qa_b_dataframe(df)
    if format == "nli-m":
        return _create_nli_m_dataframe(df)
    if format == "nli-b":
        return _create_nli_b_dataframe(df)

    raise ValueError("Invalid dataframe format.")

#Inicialização do tokenizer e do modelo para treino

In [None]:
# Tokenizer and sequence-classification model both come from the same
# pretrained Portuguese BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

# Use the GPU when one is visible and fall back to the CPU otherwise, instead
# of failing inside `.to('cuda')` on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = BertForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased").to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#Definição de funções auxiliares

In [None]:
# One combined evaluator reporting accuracy, F1, precision and recall together.
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A (logits, labels) pair.

    Returns:
        The result of `metric.compute` for the argmax of the logits over the
        last axis, compared against the labels.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Função de tokenização para QA-B

In [None]:
def tokenize_function(df):
    """Tokenize review/question pairs with the notebook-level `tokenizer`.

    Args:
        df: Mapping with 'review' and 'question' entries; in this notebook it
            is the batch handed over by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the (review, question) pairs, padded to
        `max_length` and truncated to at most 512 tokens.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(df['review'], df['question'], **encode_options)

#Carregando dataset de treino original

In [None]:
df_train_original = pd.read_csv('/content/drive/MyDrive/Arquivos de aula/TCC/Datasets/ABSAPT 2022 - Training.csv', sep=';', index_col= 0)

In [None]:
df_train_original

Unnamed: 0_level_0,review,polarity,aspect,start_position,end_position
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,O hotel é perto de todos os pontos principais ...,0,quarto,152,158
1,Viajamos eu e minha irmã. O hotel tem uma extr...,1,elevador,63,71
2,Estive por 8 dias hospedado neste hotel com mi...,1,café da manhã,209,222
3,Adorei a estadia. Porto Alegre foi sensacional...,-1,hotel,378,383
4,"O hotel tem ótima localização, fizemos vários ...",1,internet,216,224
...,...,...,...,...,...
3106,Reservei esse hotel através do pacote da TAM V...,-1,recepção,588,596
3107,Fomos muito bem atendido na chegada disponibil...,1,quarto,128,134
3108,"Excelente hotel, sem cassino e aquela confusao...",0,shopping,99,107
3109,"Prédio típico parisiense, traz um aspecto de v...",1,limpeza,83,90


#Carregando dataset de teste

In [None]:
# NOTE(review): the path below is an empty string, so no file can be read from
# it. Fill in the location of the test CSV before running; every later cell
# that uses `df_test` / `tokenized_test` depends on this load succeeding.
df_test = pd.read_csv('', sep=';', index_col= 0)

In [None]:
df_test

Conversão do dataset para question answering

In [None]:
df_test_qa_b = create_dataframe(df_test, "qa-b")
df_test_qa_b = Dataset.from_pandas(df_test_qa_b)

In [None]:
tokenized_test = df_test_qa_b.map(tokenize_function, batched=True)

Map:   0%|          | 0/2058 [00:00<?, ? examples/s]

#Aplicações

##Avaliação de controle

In [None]:
df_train_qa_b = create_dataframe(df_train_original, "qa-b")
df_train_qa_b = Dataset.from_pandas(df_train_qa_b)

In [None]:
tokenized_train = df_train_qa_b.map(tokenize_function, batched=True)

Map:   0%|          | 0/9333 [00:00<?, ? examples/s]

In [None]:
train_labels = torch.tensor(tokenized_train['label'])

In [None]:
# Control run: fine-tune on the original training set only.
# Per-epoch evaluation runs on the training set itself, so the figures logged
# during training measure fit, not generalisation.
train_args = TrainingArguments(
    evaluation_strategy='epoch',
    output_dir='/content/drive/MyDrive/Arquivos de aula/TCC/Treinamentos/control',
)
train_trainer = Trainer(
    args=train_args,
    model=model,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_train,
    compute_metrics=compute_metrics,
)

In [None]:
train_trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4804,0.433151,0.827815,0.702352,0.828671,0.60945
2,0.4436,0.44937,0.800921,0.593079,0.930584,0.43523
3,0.3825,0.358183,0.848495,0.748845,0.83684,0.677596


TrainOutput(global_step=3501, training_loss=0.44352472984460517, metrics={'train_runtime': 3587.8408, 'train_samples_per_second': 7.804, 'train_steps_per_second': 0.976, 'total_flos': 7366846439024640.0, 'train_loss': 0.44352472984460517, 'epoch': 3.0})

In [None]:
train_trainer.save_model(output_dir = '/content/drive/MyDrive/Arquivos de aula/TCC/Modelos treinados/control')

In [None]:
train_trainer.evaluate(eval_dataset = tokenized_test)

{'eval_loss': 0.4239809215068817,
 'eval_accuracy': 0.8206997084548106,
 'eval_f1': 0.7026591458501208,
 'eval_precision': 0.7855855855855856,
 'eval_recall': 0.6355685131195336,
 'eval_runtime': 64.771,
 'eval_samples_per_second': 31.773,
 'eval_steps_per_second': 3.983,
 'epoch': 3.0}

##Conjunto de dados Spacy

###Importação dos datasets

In [None]:
df_spacy_syn = pd.read_csv('/content/drive/MyDrive/Arquivos de aula/TCC/Datasets/synonyms_training_augmented.csv', index_col=0)
df_spacy_ant = pd.read_csv('/content/drive/MyDrive/Arquivos de aula/TCC/Datasets/antonyms_training_augmented.csv', index_col=0)

In [None]:
df_spacy_syn

Unnamed: 0,review,polarity,aspect,start_position,end_position
0,"Quarto muito pequeno, porém a cama é confortáv...",0,limpeza,104,111
1,"O hotel fica na place de la sorbonne, com café...",-1,elevador,148,156
2,"O hotel fica na place de la sorbonne, com café...",-1,elevador,148,156
3,Hotel simples mas confortável. O grave destaqu...,1,localização,53,64
4,Hotel simples mas confortável. O importante de...,1,localização,53,64
...,...,...,...,...,...
1837,"Ficamos por 4 dias em Las Vegas e, por ser uma...",1,quarto,409,415
1838,"Ficamos por 4 dias em Las Vegas e, por ser uma...",1,quarto,409,415
1839,"Ficamos por 4 dias em Las Vegas e, por ser uma...",1,quarto,409,415
1840,"Ficamos por 4 dias em Las Vegas e, por ser uma...",1,quarto,409,415


In [None]:
df_spacy_ant

Unnamed: 0,review,polarity,aspect,start_position,end_position
0,"Quarto muito pequeno, porém a cama é confortáv...",0,limpeza,104,111
1,Atendendo aos preços em Paris tenho que dizer ...,-1,internet,319,327
2,Atendendo aos preços em Paris tenho que dizer ...,-1,internet,319,327
3,Atendendo aos preços em Paris tenho que dizer ...,-1,internet,319,327
4,Hotel simples mas confortável. O pequenino des...,-1,localização,53,64
...,...,...,...,...,...
541,"Boa localização.Quarto inconveniente, reformad...",-1,quarto,16,22
542,Reserva pela Internet muito rápida e sem compl...,-1,localização,146,157
543,Reserva pela Internet muito rápida e sem compl...,-1,localização,146,157
544,Reserva pela Internet muito rápida e sem compl...,-1,localização,146,157


###Dataset de sinônimos

####Conversão para question answering

In [None]:
spacy_syn_qa_b = create_dataframe(pd.concat([df_train_original, df_spacy_syn]), "qa-b")
spacy_syn_qa_b = Dataset.from_pandas(spacy_syn_qa_b)

####QA-B

In [None]:
tokenized_spacy_syn = spacy_syn_qa_b.map(tokenize_function, batched=True)

Map:   0%|          | 0/14859 [00:00<?, ? examples/s]

In [None]:
tokenized_spacy_labels = torch.tensor(spacy_syn_qa_b['label'])

In [None]:
# Start from the pretrained base checkpoint. The 'Modelos treinados/spacy-syn'
# directory is only written by save_model() after this experiment has trained,
# so loading it here fails on a fresh run and, on a repeated run, fine-tunes a
# model that has already been fine-tuned on the same data.
spacy_syn_model = BertForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased").to('cuda')

In [None]:
# spaCy synonym augmentation run, trained on its own model instance.
# Per-epoch evaluation runs on the training set itself.
spacy_syn_training_args = TrainingArguments(
    evaluation_strategy='epoch',
    output_dir='/content/drive/MyDrive/Arquivos de aula/TCC/Treinamentos/spacy syn',
)
spacy_syn_trainer = Trainer(
    args=spacy_syn_training_args,
    model=spacy_syn_model,
    train_dataset=tokenized_spacy_syn,
    eval_dataset=tokenized_spacy_syn,
    compute_metrics=compute_metrics,
)

Treinamento

In [None]:
spacy_syn_trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2696,0.199304,0.934787,0.902032,0.903402,0.900666
2,0.1545,0.077658,0.980551,0.970793,0.971874,0.969715
3,0.0626,0.018853,0.995491,0.993222,0.995337,0.991116


TrainOutput(global_step=5574, training_loss=0.18850555426765608, metrics={'train_runtime': 5702.8335, 'train_samples_per_second': 7.817, 'train_steps_per_second': 0.977, 'total_flos': 1.172870151478272e+16, 'train_loss': 0.18850555426765608, 'epoch': 3.0})

In [None]:
spacy_syn_trainer.save_model(output_dir = '/content/drive/MyDrive/Arquivos de aula/TCC/Modelos treinados/spacy-syn')

In [None]:
spacy_syn_model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/Arquivos de aula/TCC/Modelos treinados/spacy-syn").to('cuda')

Avaliação sobre o dataset de treino

In [None]:
spacy_syn_trainer.evaluate()

{'eval_loss': 0.018852578476071358,
 'eval_accuracy': 0.9954909482468538,
 'eval_f1': 0.9932220536165909,
 'eval_precision': 0.9953365774533658,
 'eval_recall': 0.991116495053503,
 'eval_runtime': 481.5501,
 'eval_samples_per_second': 30.857,
 'eval_steps_per_second': 3.858}

Avaliação sobre o dataset de teste

In [None]:
spacy_syn_trainer.evaluate(eval_dataset = tokenized_test)

{'eval_loss': 0.5696452856063843,
 'eval_accuracy': 0.8935860058309038,
 'eval_f1': 0.8376575240919201,
 'eval_precision': 0.8521870286576169,
 'eval_recall': 0.8236151603498543,
 'eval_runtime': 67.3489,
 'eval_samples_per_second': 30.557,
 'eval_steps_per_second': 3.831}

In [None]:
# NOTE(review): `tokenized_tagger_ant` is only assigned much further down, in
# the POS-tagger antonym section, so this cell raises NameError when the
# notebook is run top to bottom on a fresh kernel. That dataset is also the
# antonym *training* data (original + augmented rows), not a held-out set.
spacy_syn_trainer.evaluate(eval_dataset = tokenized_tagger_ant)

{'eval_loss': 4.49513053894043,
 'eval_accuracy': 0.45408009635651914,
 'eval_f1': 0.17252396166134185,
 'eval_precision': 0.17435424354243542,
 'eval_recall': 0.17073170731707318,
 'eval_runtime': 100.4775,
 'eval_samples_per_second': 33.052,
 'eval_steps_per_second': 4.14}

###Dataset de antônimos

####Conversão para question answering

In [None]:
spacy_ant_qa_b = create_dataframe(pd.concat([df_train_original, df_spacy_ant]), "qa-b")
spacy_ant_qa_b = Dataset.from_pandas(spacy_ant_qa_b)

####QA-B

In [None]:
tokenized_spacy_ant = spacy_ant_qa_b.map(tokenize_function, batched=True)

Map:   0%|          | 0/10971 [00:00<?, ? examples/s]

In [None]:
spacy_ant_labels = torch.tensor(spacy_ant_qa_b['label'])

In [None]:
# Give this experiment its own freshly loaded pretrained model. Reusing the
# notebook-level `model` would continue from weights the control run has
# already fine-tuned, so the antonym result would not be comparable.
spacy_ant_model = BertForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased").to('cuda')

# Per-epoch evaluation runs on the training set itself.
spacy_ant_training_args = TrainingArguments(output_dir='/content/drive/MyDrive/Arquivos de aula/TCC/Treinamentos/spacy ant', evaluation_strategy='epoch')
spacy_ant_trainer = Trainer(
    model=spacy_ant_model,
    args=spacy_ant_training_args,
    train_dataset=tokenized_spacy_ant,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_spacy_ant
)

Treinamento

In [None]:
spacy_ant_trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5703,0.5236,0.75499,0.648719,0.621277,0.678698
2,0.4955,0.475277,0.807219,0.695902,0.733778,0.661745
3,0.4002,0.36199,0.857169,0.777951,0.807353,0.750615


TrainOutput(global_step=4116, training_loss=0.4993774934922525, metrics={'train_runtime': 4137.7401, 'train_samples_per_second': 7.954, 'train_steps_per_second': 0.995, 'total_flos': 8659774165063680.0, 'train_loss': 0.4993774934922525, 'epoch': 3.0})

In [None]:
spacy_ant_trainer.save_model(output_dir = '/content/drive/MyDrive/Arquivos de aula/TCC/Modelos treinados/spacy-ant')

Avaliação sobre o dataset de teste

In [None]:
spacy_ant_trainer.evaluate(eval_dataset = tokenized_test)

{'eval_loss': 0.38663583993911743,
 'eval_accuracy': 0.8459669582118562,
 'eval_f1': 0.7603930461073318,
 'eval_precision': 0.7896389324960753,
 'eval_recall': 0.7332361516034985,
 'eval_runtime': 65.9418,
 'eval_samples_per_second': 31.209,
 'eval_steps_per_second': 3.913,
 'epoch': 3.0}

##Conjunto de dados Albertina

###Importação dos datasets

In [None]:
df_albertina = pd.read_csv('/content/drive/MyDrive/Arquivos de aula/TCC/Datasets/synonyms_training__transformers_augmented.csv', index_col=0)

In [None]:
df_albertina

Unnamed: 0,review,polarity,aspect,start_position,end_position
0,O hotel é perto de todos os pontos principais ...,0,quarto,152,158
1,O hotel é perto de todos os pontos principais ...,0,quarto,152,158
2,O hotel é perto de todos os pontos principais ...,0,quarto,152,158
3,Viajamos eu e minha irmã. O hotel tem uma extr...,1,elevador,63,71
4,Viajamos eu e minha irmã. O hotel tem uma extr...,1,elevador,63,71
...,...,...,...,...,...
2195,a localização achei muito boa... bem localizad...,1,funcionários,256,268
2196,"Venho de chegar de minha viagem de Paris, Pass...",1,hotel,75,80
2197,"Proximo ao Metro (Republique), onde há conexõe...",1,cama,248,252
2198,"Proximo ao Metro (Republique), onde há conexõe...",1,cama,248,252


###Conversão para question answering

In [None]:
albertina_qa_b = create_dataframe(pd.concat([df_train_original, df_albertina]), "qa-b")
albertina_qa_b = Dataset.from_pandas(albertina_qa_b)

###Treinamento

In [None]:
# Start from the pretrained base checkpoint. 'Treinamentos/albertina/checkpoint-5500'
# is an intermediate checkpoint written by this experiment's own training run:
# it does not exist on a fresh run, and training on top of it adds epochs the
# other experiments do not get. An interrupted run is continued with
# `train(resume_from_checkpoint=True)` instead.
albertina_model = BertForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased").to('cuda')

In [None]:
tokenized_albertina = albertina_qa_b.map(tokenize_function, batched=True)

Map:   0%|          | 0/15933 [00:00<?, ? examples/s]

In [None]:
albertina_labels = torch.tensor(albertina_qa_b['label'])

In [None]:
# Albertina augmentation run, trained on its own model instance.
# Per-epoch evaluation runs on the training set itself.
albertina_training_args = TrainingArguments(
    evaluation_strategy='epoch',
    output_dir='/content/drive/MyDrive/Arquivos de aula/TCC/Treinamentos/albertina',
)
albertina_trainer = Trainer(
    args=albertina_training_args,
    model=albertina_model,
    train_dataset=tokenized_albertina,
    eval_dataset=tokenized_albertina,
    compute_metrics=compute_metrics,
)

In [None]:
albertina_trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3368,0.295415,0.905542,0.855774,0.87139,0.840708
2,0.2316,0.139744,0.958514,0.936994,0.948842,0.925438


In [None]:
albertina_trainer.train(resume_from_checkpoint = True)



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
3,0.1334,0.075633,0.982489,0.973642,0.977057,0.97025


TrainOutput(global_step=5976, training_loss=0.007105442414800805, metrics={'train_runtime': 875.8166, 'train_samples_per_second': 54.576, 'train_steps_per_second': 6.823, 'total_flos': 1.257644533515264e+16, 'train_loss': 0.007105442414800805, 'epoch': 3.0})

In [None]:
albertina_trainer.save_model(output_dir = '/content/drive/MyDrive/Arquivos de aula/TCC/Modelos treinados/albertina')

In [None]:
albertina_trainer.evaluate(eval_dataset = tokenized_test)

{'eval_loss': 0.5056974291801453,
 'eval_accuracy': 0.89067055393586,
 'eval_f1': 0.8349229640498901,
 'eval_precision': 0.8404726735598228,
 'eval_recall': 0.8294460641399417,
 'eval_runtime': 68.7531,
 'eval_samples_per_second': 29.933,
 'eval_steps_per_second': 3.753,
 'epoch': 3.0}

##Conjunto de dados pos tagger

###Importação dos datasets

In [None]:
df_tagger_syn = pd.read_csv('/content/drive/MyDrive/Arquivos de aula/TCC/Datasets/synonyms_training__transformers_pos_tagger_augmented.csv', index_col=0)
df_tagger_ant = pd.read_csv('/content/drive/MyDrive/Arquivos de aula/TCC/Datasets/synonyms_training__transformers_pos_tagger_augmented_antonyms.csv', index_col=0)

In [None]:
df_tagger_syn

Unnamed: 0,review,polarity,aspect,start_position,end_position
0,Viajamos eu e minha irmã. O hotel tem uma extr...,1,elevador,63,71
1,Viajamos eu e minha irmã. O hotel tem uma extr...,1,elevador,63,71
2,Viajamos eu e minha irmã. O hotel tem uma extr...,1,elevador,63,71
3,Viajamos eu e minha irmã. O hotel tem uma extr...,1,elevador,63,71
4,O Audran fica pertinho do metrô e de Sacre-cou...,1,hotel,199,204
...,...,...,...,...,...
3455,Excelente este hotel. Ficamos em um amplo quar...,1,limpeza,122,129
3456,"Apesar de sempre ler as opiniões, dizendo que ...",-1,localização,48,59
3457,"Apesar de sempre ler as opiniões, dizendo que ...",-1,localização,48,59
3458,"Apesar de sempre ler as opiniões, dizendo que ...",-1,localização,48,59


In [None]:
df_tagger_ant

Unnamed: 0,review,polarity,aspect,start_position,end_position
0,"Quarto muito pequeno, porém a cama é confortáv...",0,limpeza,104,111
1,O Distrik é um ótimo hotel. Equipe atenciosa n...,-1,quarto,66,72
2,O Distrik é um ótimo hotel. Equipe atenciosa n...,-1,quarto,66,72
3,O Distrik é um ótimo hotel. Equipe atenciosa n...,-1,quarto,66,72
4,O Distrik é um ótimo hotel. Equipe atenciosa n...,-1,quarto,66,72
...,...,...,...,...,...
1102,"Ótima localização, perto de vários pontos turí...",-1,café da manhã,403,416
1103,O hotel é charmoso e aconchegante. Tem um ótim...,-1,café da manhã,48,61
1104,O hotel tem valor acessível levando-se em cont...,1,café da manhã,232,245
1105,O hotel tem valor acessível levando-se em cont...,1,café da manhã,232,245


###Conversão para question answering

In [None]:
tagger_syn_qa_b = create_dataframe(pd.concat([df_train_original, df_tagger_syn]), "qa-b")
tagger_syn_qa_b = Dataset.from_pandas(tagger_syn_qa_b)

tagger_ant_qa_b = create_dataframe(pd.concat([df_train_original, df_tagger_ant]), "qa-b")
tagger_ant_qa_b = Dataset.from_pandas(tagger_ant_qa_b)

###Dataset de sinônimos

####Treinamento

In [None]:
tokenized_tagger_syn = tagger_syn_qa_b.map(tokenize_function, batched=True)

Map:   0%|          | 0/19713 [00:00<?, ? examples/s]

In [None]:
tagger_syn_labels = torch.tensor(tagger_syn_qa_b['label'])

In [None]:
# Start from the pretrained base checkpoint. 'Treinamentos/tagger_syn/checkpoint-2500'
# is an intermediate checkpoint written by this experiment's own training run:
# it does not exist on a fresh run, and training on top of it adds steps the
# other experiments do not get. An interrupted run is continued with
# `train(resume_from_checkpoint=True)` instead.
model_tagger_syn = BertForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased").to('cuda')

In [None]:
# POS-tagger synonym augmentation run, trained on its own model instance.
# Per-epoch evaluation runs on the training set itself.
tagger_syn_training_args = TrainingArguments(
    evaluation_strategy='epoch',
    output_dir='/content/drive/MyDrive/Arquivos de aula/TCC/Treinamentos/tagger_syn',
)
tagger_syn_trainer = Trainer(
    args=tagger_syn_training_args,
    model=model_tagger_syn,
    train_dataset=tokenized_tagger_syn,
    eval_dataset=tokenized_tagger_syn,
    compute_metrics=compute_metrics,
)

In [None]:
tagger_syn_trainer.train()



Epoch,Training Loss,Validation Loss


In [None]:
tagger_syn_trainer.train(resume_from_checkpoint = True)



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
2,0.2011,0.152328,0.959265,0.938112,0.950344,0.926191
3,0.131,0.06419,0.984122,0.976087,0.980055,0.97215


TrainOutput(global_step=7395, training_loss=0.12056870218706421, metrics={'train_runtime': 4896.0813, 'train_samples_per_second': 12.079, 'train_steps_per_second': 1.51, 'total_flos': 1.556012470293504e+16, 'train_loss': 0.12056870218706421, 'epoch': 3.0})

In [None]:
tagger_syn_trainer.save_model(output_dir = '/content/drive/MyDrive/Arquivos de aula/TCC/Modelos treinados/tagger_syn')

In [None]:
tagger_syn_trainer.evaluate(eval_dataset = tokenized_test)

NameError: ignored

###Dataset de antônimos

####Treinamento

In [None]:
tokenized_tagger_ant = tagger_ant_qa_b.map(tokenize_function, batched=True)

Map:   0%|          | 0/12654 [00:00<?, ? examples/s]

In [None]:
tagger_ant_labels = torch.tensor(tagger_ant_qa_b['label'])

In [None]:
# Give this experiment its own freshly loaded pretrained model. Reusing the
# notebook-level `model` would continue from weights that earlier experiments
# in this notebook have already fine-tuned.
tagger_ant_model = BertForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased").to('cuda')

# learning_rate was 1e-3, and the recorded run collapsed to a constant
# prediction (accuracy 0.667 with F1/precision/recall of 0 at every epoch).
# 5e-5 is the Trainer default, i.e. the value the other experiments here run
# with, and sits in the usual range for BERT fine-tuning.
# Per-epoch evaluation runs on the training set itself.
tagger_ant_training_args = TrainingArguments(output_dir='/content/drive/MyDrive/Arquivos de aula/TCC/Treinamentos/tagger_ant', evaluation_strategy='epoch', per_device_train_batch_size=16, learning_rate=5e-5)
tagger_ant_trainer = Trainer(
    model=tagger_ant_model,
    args=tagger_ant_training_args,
    train_dataset=tokenized_tagger_ant,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_tagger_ant
)

In [None]:
tagger_ant_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6933,0.641035,0.666667,0.0,0.0,0.0
2,0.6559,0.656337,0.666667,0.0,0.0,0.0
3,0.6526,0.638376,0.666667,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=2373, training_loss=0.6622225716804283, metrics={'train_runtime': 4825.95, 'train_samples_per_second': 7.866, 'train_steps_per_second': 0.492, 'total_flos': 9988221883576320.0, 'train_loss': 0.6622225716804283, 'epoch': 3.0})

In [None]:
tagger_ant_trainer.save_model(output_dir = '/content/drive/MyDrive/Arquivos de aula/TCC/Modelos treinados/tagger_ant')

In [None]:
tagger_ant_trainer.evaluate(eval_dataset = tokenized_test)

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6106414198875427,
 'eval_accuracy': 0.6666666666666666,
 'eval_f1': 0.0,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_runtime': 65.4005,
 'eval_samples_per_second': 31.468,
 'eval_steps_per_second': 3.945,
 'epoch': 3.0}