In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

import bert_functions as bf
import pandas as pd
import os

# Model with best results so far
model_dir = os.path.join('models', 'size_16000_00:56:40')

model = BertForSequenceClassification.from_pretrained(model_dir, device_map='cpu')
tokenizer = BertTokenizer.from_pretrained(model_dir)

# Different datasets
encoded_dir = 'encoded'
loaded_datasets = bf.load_encoded_datasets(encoded_dir, bf.all_datasets)

# List with test datasets
names = ['dirty_DBLP-ACM_test', 'dirty_DBLP-GoogleScholar_test', 'dirty_iTunes-Amazon_test', \
            'structured_Amazon-Google_test', 'structured_Beer_test', 'structured_DBLP-ACM_test', \
            'structured_DBLP-ACM_test', 'structured_DBLP-GoogleScholar_test', 'structured_Fodors-Zagats_test', \
            'structured_iTunes-Amazon_test', 'structured_Walmart-Amazon_test', 'textual_Abt-Buy_test', \
]

# Create a DataFrame to store the results
results_df = pd.DataFrame(columns=['Type', 'Dataset', 'Accuracy', 'Precision', 'Recall', 'F1'])

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model, 
    args=training_args,
    compute_metrics=bf.compute_metrics,
)

# Evaluate the model on each dataset
for dataset in names:
    combined_test_dataset = loaded_datasets[dataset]
    test_dataset = bf.CustomDataset(combined_test_dataset)

    # Evaluate the model
    test_result = trainer.evaluate(eval_dataset=test_dataset)
    
    results_df.loc[len(results_df)] = [
        dataset.split('_')[0], 
        dataset.split('_')[1], 
        test_result['eval_accuracy'], 
        test_result['eval_precision'], 
        test_result['eval_recall'], 
        test_result['eval_f1']
    ]

# Save the results to a csv in the results folder in parent directory
parent_dir = os.path.dirname(os.getcwd())
results_path = os.path.join(parent_dir, 'results', 'bert_results.csv')
results_df.to_csv(results_path)
results_df.T

  from pandas.core import (


Dataset structured_Abt-Buy does not exist
Dataset textual_DBLP-ACM does not exist
Dataset textual_Amazon-Google does not exist
Dataset textual_Walmart-Amazon does not exist
Dataset textual_DBLP-GoogleScholar does not exist
Dataset textual_Fodors-Zagats does not exist
Dataset textual_Beer does not exist
Dataset textual_iTunes-Amazon does not exist
Dataset dirty_Abt-Buy does not exist
Dataset dirty_Amazon-Google does not exist
Dataset dirty_Fodors-Zagats does not exist
Dataset dirty_Beer does not exist


  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/14 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/34 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/1 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/8 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/1 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/14 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/14 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/34 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/1 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/1 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/7 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/7 [00:00<?, ?it/s]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Type,dirty,dirty,dirty,structured,structured,structured,structured,structured,structured,structured,structured,textual
Dataset,DBLP-ACM,DBLP-GoogleScholar,iTunes-Amazon,Amazon-Google,Beer,DBLP-ACM,DBLP-ACM,DBLP-GoogleScholar,Fodors-Zagats,iTunes-Amazon,Walmart-Amazon,Abt-Buy
Accuracy,0.707207,0.73972,0.62963,0.666667,0.708333,0.71509,0.71509,0.742056,0.522727,0.759259,0.654545,0.652913
Precision,0.712963,0.759878,0.705882,0.675676,0.733333,0.708972,0.708972,0.74761,0.514286,0.791667,0.627119,0.664921
Recall,0.693694,0.700935,0.444444,0.641026,0.785714,0.72973,0.72973,0.730841,0.818182,0.703704,0.766839,0.616505
F1,0.703196,0.729217,0.545455,0.657895,0.758621,0.719201,0.719201,0.73913,0.631579,0.745098,0.689977,0.639798
