In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

import bert_functions as bf
import pandas as pd
import sys, os

# Model with best results so far
model_dir = os.path.join('models', '15000_model_10_hours')

model = BertForSequenceClassification.from_pretrained(model_dir, device_map='cpu')
tokenizer = BertTokenizer.from_pretrained(model_dir)

# Different datasets
encoded_dir = 'encoded'
loaded_datasets = bf.load_encoded_datasets(encoded_dir, bf.all_datasets)

# List with test datasets
names = ['dirty_DBLP-ACM_test', 'dirty_DBLP-GoogleScholar_test', 'dirty_iTunes-Amazon_test', \
            'structured_Amazon-Google_test', 'structured_Beer_test', 'structured_DBLP-ACM_test', \
            'structured_DBLP-ACM_test', 'structured_DBLP-GoogleScholar_test', 'structured_Fodors-Zagats_test', \
            'structured_iTunes-Amazon_test', 'structured_Walmart-Amazon_test', 'textual_Abt-Buy_test', \
]

# Create a DataFrame to store the results
results_df = pd.DataFrame(columns=['Type', 'Dataset', 'Accuracy', 'Precision', 'Recall', 'F1'])

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model, 
    args=training_args,
    compute_metrics=bf.compute_metrics,
)

# Evaluate the model on each dataset
for dataset in names:
    combined_test_dataset = loaded_datasets[dataset]
    test_dataset = bf.CustomDataset(combined_test_dataset)

    # Evaluate the model
    test_result = trainer.evaluate(eval_dataset=test_dataset)
    print(f"Results for {dataset}, {test_result}")
    
    results_df.loc[len(results_df)] = [
        dataset.split('_')[0], 
        dataset.split('_')[1], 
        test_result['eval_accuracy'], 
        test_result['eval_precision'], 
        test_result['eval_recall'], 
        test_result['eval_f1']
    ]

# Save the results to a csv file
sys.path.append("..")
results_path = os.path.join('results', 'distilbert_results.csv')
results_df.to_csv(results_path)

  from pandas.core import (


Dataset structured_Abt-Buy does not exist
Dataset textual_DBLP-ACM does not exist
Dataset textual_Amazon-Google does not exist
Dataset textual_Walmart-Amazon does not exist
Dataset textual_DBLP-GoogleScholar does not exist
Dataset textual_Fodors-Zagats does not exist
Dataset textual_Beer does not exist
Dataset textual_iTunes-Amazon does not exist
Dataset dirty_Abt-Buy does not exist
Dataset dirty_Amazon-Google does not exist
Dataset dirty_Fodors-Zagats does not exist
Dataset dirty_Beer does not exist


  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/14 [00:00<?, ?it/s]

Results for dirty_DBLP-ACM_test, {'eval_loss': 0.6117464900016785, 'eval_accuracy': 0.6914414414414415, 'eval_f1': 0.6836027713625866, 'eval_precision': 0.7014218009478673, 'eval_recall': 0.6666666666666666, 'eval_runtime': 49.0595, 'eval_samples_per_second': 18.1, 'eval_steps_per_second': 0.285}


  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/34 [00:00<?, ?it/s]

Results for dirty_DBLP-GoogleScholar_test, {'eval_loss': 0.5336354970932007, 'eval_accuracy': 0.755607476635514, 'eval_f1': 0.7503579952267303, 'eval_precision': 0.7668292682926829, 'eval_recall': 0.7345794392523365, 'eval_runtime': 108.2397, 'eval_samples_per_second': 19.771, 'eval_steps_per_second': 0.314}


  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/1 [00:00<?, ?it/s]

Results for dirty_iTunes-Amazon_test, {'eval_loss': 0.5749852657318115, 'eval_accuracy': 0.7037037037037037, 'eval_f1': 0.6521739130434783, 'eval_precision': 0.7894736842105263, 'eval_recall': 0.5555555555555556, 'eval_runtime': 2.6797, 'eval_samples_per_second': 20.151, 'eval_steps_per_second': 0.373}


  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/8 [00:00<?, ?it/s]

Results for structured_Amazon-Google_test, {'eval_loss': 0.7358266115188599, 'eval_accuracy': 0.6388888888888888, 'eval_f1': 0.6132723112128147, 'eval_precision': 0.6600985221674877, 'eval_recall': 0.5726495726495726, 'eval_runtime': 22.2319, 'eval_samples_per_second': 21.051, 'eval_steps_per_second': 0.36}


  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/1 [00:00<?, ?it/s]

Results for structured_Beer_test, {'eval_loss': 0.7366620898246765, 'eval_accuracy': 0.5833333333333334, 'eval_f1': 0.6428571428571429, 'eval_precision': 0.6428571428571429, 'eval_recall': 0.6428571428571429, 'eval_runtime': 1.1079, 'eval_samples_per_second': 21.663, 'eval_steps_per_second': 0.903}


  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/14 [00:00<?, ?it/s]

Results for structured_DBLP-ACM_test, {'eval_loss': 0.6129065752029419, 'eval_accuracy': 0.6903153153153153, 'eval_f1': 0.6871444823663254, 'eval_precision': 0.6942528735632184, 'eval_recall': 0.6801801801801802, 'eval_runtime': 41.9124, 'eval_samples_per_second': 21.187, 'eval_steps_per_second': 0.334}


  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/14 [00:00<?, ?it/s]

Results for structured_DBLP-ACM_test, {'eval_loss': 0.6129065752029419, 'eval_accuracy': 0.6903153153153153, 'eval_f1': 0.6871444823663254, 'eval_precision': 0.6942528735632184, 'eval_recall': 0.6801801801801802, 'eval_runtime': 42.7103, 'eval_samples_per_second': 20.791, 'eval_steps_per_second': 0.328}


  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/34 [00:00<?, ?it/s]

Results for structured_DBLP-GoogleScholar_test, {'eval_loss': 0.5378163456916809, 'eval_accuracy': 0.7546728971962616, 'eval_f1': 0.7534053546265852, 'eval_precision': 0.7573182247403211, 'eval_recall': 0.7495327102803738, 'eval_runtime': 105.3379, 'eval_samples_per_second': 20.316, 'eval_steps_per_second': 0.323}


  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/1 [00:00<?, ?it/s]

Results for structured_Fodors-Zagats_test, {'eval_loss': 0.8666257262229919, 'eval_accuracy': 0.4772727272727273, 'eval_f1': 0.46511627906976744, 'eval_precision': 0.47619047619047616, 'eval_recall': 0.45454545454545453, 'eval_runtime': 2.1566, 'eval_samples_per_second': 20.402, 'eval_steps_per_second': 0.464}


  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/1 [00:00<?, ?it/s]

Results for structured_iTunes-Amazon_test, {'eval_loss': 0.5279843807220459, 'eval_accuracy': 0.6666666666666666, 'eval_f1': 0.5909090909090909, 'eval_precision': 0.7647058823529411, 'eval_recall': 0.48148148148148145, 'eval_runtime': 2.6277, 'eval_samples_per_second': 20.55, 'eval_steps_per_second': 0.381}


  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/7 [00:00<?, ?it/s]

Results for structured_Walmart-Amazon_test, {'eval_loss': 0.6925813555717468, 'eval_accuracy': 0.6571428571428571, 'eval_f1': 0.6748768472906403, 'eval_precision': 0.6431924882629108, 'eval_recall': 0.7098445595854922, 'eval_runtime': 21.0403, 'eval_samples_per_second': 18.298, 'eval_steps_per_second': 0.333}


  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


  0%|          | 0/7 [00:00<?, ?it/s]

Results for textual_Abt-Buy_test, {'eval_loss': 0.7490178942680359, 'eval_accuracy': 0.5970873786407767, 'eval_f1': 0.5631578947368421, 'eval_precision': 0.6149425287356322, 'eval_recall': 0.5194174757281553, 'eval_runtime': 20.1958, 'eval_samples_per_second': 20.4, 'eval_steps_per_second': 0.347}
