In [24]:
import os, sys, time, torch, pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Append the configuration path
sys.path.append("..")
import config

# Load configuration
folders = [config.STRUCTURED_DIR, config.TEXTUAL_DIR, config.DIRTY_DIR]
datasets = [
    config.DBLP_ACM_DIR, config.ABT_BUY_DIR, config.AMAZON_GOOGLE_DIR,
    config.WALMART_AMAZON_DIR, config.DBLP_GOOGLESCHOLAR_DIR,
    config.FODORS_ZAGATS_DIR, config.BEER_DIR, config.ITUNES_AMAZON_DIR
]

class CustomDataset(Dataset):
    def __init__(self, data, size=None):
        self.data = data
        self.size = size if size is not None else len(self.data[list(self.data.keys())[0]])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}
        return item

    def __len__(self):
        return self.size

def count_training_samples(folders, datasets):
    total_preds = 0
    for folder_name in folders:
        for dataset_name in datasets:
            try:
                train, _, _ = config.load_datasets(folder_name, dataset_name)
                total_preds += len(train)
            except:
                print(f"Dataset {folder_name}_{dataset_name} does not exist")
                continue
    print(f"Total training samples: {total_preds}\n")
    return total_preds

def create_dataset_dict(tableA_df, tableB_df, ltable_id, rtable_id, label):
    return {
        "tableA_df": tableA_df, 
        "tableB_df": tableB_df, 
        "ltable_id": ltable_id, 
        "rtable_id": rtable_id, 
        "label": label
    }

def load_and_prepare_datasets(folders, datasets):
    all_datasets = {}
    for folder_name in folders:
        for dataset_name in datasets:
            try:
                train, val, test = config.load_datasets(folder_name, dataset_name)
                tableA_df, tableB_df = config.tableA_tableB(folder_name, dataset_name)
                the_dataset = f"{folder_name}_{dataset_name}"

                all_datasets[f"{the_dataset}_train"] = create_dataset_dict(
                    tableA_df, tableB_df, train['ltable_id'], train['rtable_id'], train['label']
                )
                all_datasets[f"{the_dataset}_val"] = create_dataset_dict(
                    tableA_df, tableB_df, val['ltable_id'], val['rtable_id'], val['label']
                )
                all_datasets[f"{the_dataset}_test"] = create_dataset_dict(
                    tableA_df, tableB_df, test['ltable_id'], test['rtable_id'], test['label']
                )
            except:
                print(f"Dataset {folder_name}_{dataset_name} does not exist")
                continue
    return all_datasets

def preprocess_function(dataset, tokenizer):
    tokenized_inputs = []
    labels = []
    total_count_0 = sum(label == 0 for label in dataset['label'])
    total_count_1 = sum(label == 1 for label in dataset['label'])
    count_0, count_1 = 0, 0
    for l_id, r_id, label in zip(dataset['ltable_id'], dataset['rtable_id'], dataset['label']):
        # If the label is 0 (majority class) and we have already added enough samples of this class, skip this sample
        if label == 0 and count_0 >= total_count_1:
            continue
        entity1 = dataset['tableA_df'].loc[l_id].drop('id')
        entity2 = dataset['tableB_df'].loc[r_id].drop('id')
        entity1 = ' '.join(f'{col}: {val}' for col, val in entity1.items())
        entity2 = ' '.join(f'{col}: {val}' for col, val in entity2.items())
        tokenized_inputs.append(tokenizer(entity1, entity2, truncation=True, padding='max_length', max_length=512))
        labels.append(torch.tensor(label))
        # Update the counts
        if label == 0:
            count_0 += 1
        else:
            count_1 += 1
    return {
        'input_ids': [ti['input_ids'] for ti in tokenized_inputs],
        'attention_mask': [ti['attention_mask'] for ti in tokenized_inputs],
        'labels': labels
    }

def load_encoded_datasets(encoded_dir, all_datasets):
    loaded_datasets = {}
    for dataset_name in all_datasets.keys():
        try:
            loaded_datasets[dataset_name] = torch.load(os.path.join(encoded_dir, f"{dataset_name}.pt"))
        except Exception as e:
            print(f"Failed to load dataset {dataset_name}")
    return loaded_datasets

def combine_datasets(loaded_datasets, suffix):
    combined_data = {key: [] for key in loaded_datasets[list(loaded_datasets.keys())[0]].data.keys()}
    for dataset_name, dataset in loaded_datasets.items():
        if dataset_name.endswith(suffix):
            for key in combined_data.keys():
                combined_data[key].extend(dataset.data[key])
    return CustomDataset(combined_data)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


count_training_samples(folders, datasets)
all_datasets = load_and_prepare_datasets(folders, datasets)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Uncomment to preprocess datasets
# encoded_datasets = {name: preprocess_function(data, tokenizer) for name, data in all_datasets.items()}
# for name, dataset in encoded_datasets.items():
#     torch.save(dataset, f"encoded/{name}.pt")

device = torch.device('cpu')

encoded_dir = 'encoded'
loaded_datasets = load_encoded_datasets(encoded_dir, all_datasets)

for dataset_name, dataset in loaded_datasets.items():
    loaded_datasets[dataset_name] = CustomDataset(dataset)

combined_train_dataset = combine_datasets(loaded_datasets, "_train")
combined_val_dataset = combine_datasets(loaded_datasets, "_val")
combined_test_dataset = combine_datasets(loaded_datasets, "_test")

train_size = 1000
train_dataset = CustomDataset(combined_train_dataset.data, size=train_size)
val_dataset = CustomDataset(combined_val_dataset.data, size=train_size)
test_dataset = CustomDataset(combined_test_dataset.data, size=train_size)

model = BertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    learning_rate=1e-4,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

time_now = time.strftime("%H:%M:%S", time.localtime())
output_dir = f"models/combined_{train_size}_{time_now}"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

test_result = trainer.evaluate(eval_dataset=test_dataset)
test_result = pd.DataFrame(test_result, index=[0])
print(test_result.T)

Dataset structured_Abt-Buy does not exist
Dataset textual_DBLP-ACM does not exist
Dataset textual_Amazon-Google does not exist
Dataset textual_Walmart-Amazon does not exist
Dataset textual_DBLP-GoogleScholar does not exist
Dataset textual_Fodors-Zagats does not exist
Dataset textual_Beer does not exist
Dataset textual_iTunes-Amazon does not exist
Dataset dirty_Abt-Buy does not exist
Dataset dirty_Amazon-Google does not exist
Dataset dirty_Fodors-Zagats does not exist
Dataset dirty_Beer does not exist
Total training samples: 75662

Dataset structured_Abt-Buy does not exist
Dataset textual_DBLP-ACM does not exist
Dataset textual_Amazon-Google does not exist
Dataset textual_Walmart-Amazon does not exist
Dataset textual_DBLP-GoogleScholar does not exist
Dataset textual_Fodors-Zagats does not exist
Dataset textual_Beer does not exist
Dataset textual_iTunes-Amazon does not exist
Dataset dirty_Abt-Buy does not exist
Dataset dirty_Amazon-Google does not exist
Dataset dirty_Fodors-Zagats does n

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'enc

  0%|          | 0/189 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}


In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Specify the directory where the model is saved
model_dir = 'models/combined_10000_23:22:43'

# Load the model
model = BertForSequenceClassification.from_pretrained(model_dir, device_map='cpu')
tokenizer = BertTokenizer.from_pretrained(model_dir)

trainer = Trainer(model=model, compute_metrics=compute_metrics)
small_test_dataset = CustomDataset(combined_test_dataset.data)

# Evaluate the model
test_result = trainer.evaluate(eval_dataset=small_test_dataset)

predictions = trainer.predict(small_test_dataset)
probabilities = torch.nn.functional.softmax(torch.from_numpy(predictions.predictions), dim=-1)
predicted_classes = torch.argmax(probabilities, dim=-1)
print(predicted_classes)


# Convert test results to a pandas dataframe
test_result = pd.DataFrame(test_result, index=[0])

# # Print the test results
test_result.T