In [3]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import pandas as pd

import sys, os, time, random
sys.path.append("..")
import config


folders = [config.STRUCTURED_DIR, config.TEXTUAL_DIR, config.DIRTY_DIR]
datasets = [config.DBLP_ACM_DIR, config.ABT_BUY_DIR, config.AMAZON_GOOGLE_DIR, \
            config.WALMART_AMAZON_DIR, config.DBLP_GOOGLESCHOLAR_DIR, config.FODORS_ZAGATS_DIR, \
                config.BEER_DIR, config.ITUNES_AMAZON_DIR]

total_preds = 0
for folder_name in folders:
    for dataset_name in datasets:
        try:
            train, val, test = config.load_datasets(folder_name, dataset_name)
            total_preds += len(train)
        except:
            # print(f"Dataset {folder_name}_{dataset_name} does not exist.")
            continue
print(f"Total training samples: {total_preds} \n")


# create the huge disctorionary to store all the data
def create_dataset_dict(tableA_df, tableB_df, ltable_id, rtable_id, label):
    return {
        "tableA_df": tableA_df, 
        "tableB_df": tableB_df, 
        "ltable_id" : ltable_id, 
        "rtable_id" : rtable_id, 
        "label" : label
    }

all_datasets = {}

for x, folder_name in enumerate(folders):
    for y, dataset_name in enumerate(datasets):
        try:
            train, val, test = config.load_datasets(folder_name, dataset_name)
            tableA_df, tableB_df = config.tableA_tableB(folder_name, dataset_name)
            the_dataset = folder_name + "_" + dataset_name

            all_datasets[the_dataset + "_train"] = create_dataset_dict(
                tableA_df, tableB_df, train['ltable_id'], train['rtable_id'], train['label']
            )
            all_datasets[the_dataset + "_val"] = create_dataset_dict(
                tableA_df, tableB_df, val['ltable_id'], val['rtable_id'], val['label']
            )
            all_datasets[the_dataset + "_test"] = create_dataset_dict(
                tableA_df, tableB_df, test['ltable_id'], test['rtable_id'], test['label']
            )

        except Exception as e:
            print(f"Dataset {folder_name}_{dataset_name} does not exist.")
            continue

# Preprocess the dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_function(dataset):
    # Initialize lists to store the tokenized text data and labels
    tokenized_inputs = []
    labels = []
    # Iterate over the rows in the ltable_id and rtable_id Series
    for l_id, r_id, label in zip(dataset['ltable_id'], dataset['rtable_id'], dataset['label']):
        # Map the identifiers to their corresponding text data
        entity1 = dataset['tableA_df'].loc[l_id]
        entity2 = dataset['tableB_df'].loc[r_id]
        # Drop id
        entity1 = entity1.drop('id')
        entity2 = entity2.drop('id')
        # Join the text data into a single string with column names as separators
        entity1 = ' '.join(f'{col}: {val}' for col, val in entity1.items())
        entity2 = ' '.join(f'{col}: {val}' for col, val in entity2.items())
        # Tokenize the text data and add it to the list
        tokenized_inputs.append(tokenizer(entity1, entity2, truncation=True, padding='max_length', max_length=512))
        # Process the label and add it to the list
        labels.append(torch.tensor(label))
    # Return the tokenized inputs and the labels
    return {'input_ids': [ti['input_ids'] for ti in tokenized_inputs], 'attention_mask': [ti['attention_mask'] for ti in tokenized_inputs], 'labels': labels}

# Preprocessing to each dataset that takes 2 minutes
"""
encoded_datasets = {}
for dataset_name, dataset in all_datasets.items():
    encoded_datasets[dataset_name] = preprocess_function(dataset)
    torch.save(encoded_datasets[dataset_name], f"encoded/{dataset_name}.pt")
"""

# List of dataset names
loaded_datasets = {}

# Load the encoded datasets, 17 seconds instead of 2 minutes
for dataset_name, dataset in all_datasets.items():
    loaded_datasets[dataset_name] = torch.load(f'encoded/{dataset_name}.pt')
    
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, data, size=None):
        self.data = data
        self.size = size if size is not None else len(self.data[list(self.data.keys())[0]])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}
        return item

    def __len__(self):
        return self.size

# Convert the loaded datasets into Dataset objects
for dataset_name, dataset in loaded_datasets.items():
    loaded_datasets[dataset_name] = CustomDataset(dataset)

# Load the pre-trained model
model = BertForSequenceClassification.from_pretrained('distilbert/distilbert-base-uncased', num_labels=2)
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

def combine_datasets(loaded_datasets, suffix):
    combined_data = {key: [] for key in loaded_datasets[list(loaded_datasets.keys())[0]].data.keys()}
    for dataset_name, dataset in loaded_datasets.items():
        if dataset_name.endswith(suffix):
            for key in combined_data.keys():
                combined_data[key].extend(dataset.data[key])
    return CustomDataset(combined_data)

# Use the function to combine the training and test datasets
# combined_train_dataset = combine_datasets(loaded_datasets, "_train")
# combined_val_dataset = combine_datasets(loaded_datasets, "_val")
# combined_test_dataset = combine_datasets(loaded_datasets, "_test")

# Define a smaller size for the training dataset
train_size = 100  # Adjust this value based on your needs
# Create a smaller training dataset
small_train_dataset = CustomDataset(loaded_datasets['structured_DBLP-ACM_train.pt'].data, size=train_size)
small_val_dataset = CustomDataset(loaded_datasets['structured_DBLP-ACM_val.pt'].data, size=train_size)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# format current time as a string with HH:MM:SS
time_now = time.strftime("%H:%M:%S", time.localtime())

# save the model
trainer.save_model(f"models/train_size_{train_size}_{time_now}")

# Test the model
test_result = trainer.evaluate(eval_dataset=loaded_datasets['structured_DBLP-ACM_test.pt'].data)

# Print the test results
print(f"Test Results: {test_result}")



Total training samples: 75662 

Dataset structured_Abt-Buy does not exist.
Dataset textual_DBLP-ACM does not exist.
Dataset textual_Amazon-Google does not exist.
Dataset textual_Walmart-Amazon does not exist.
Dataset textual_DBLP-GoogleScholar does not exist.
Dataset textual_Fodors-Zagats does not exist.
Dataset textual_Beer does not exist.
Dataset textual_iTunes-Amazon does not exist.
Dataset dirty_Abt-Buy does not exist.
Dataset dirty_Amazon-Google does not exist.
Dataset dirty_Fodors-Zagats does not exist.
Dataset dirty_Beer does not exist.


You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.we

KeyError: 'structured_DBLP_ACM_train.pt'